unique_toolkit 0.5.24__py3-none-any.whl → 0.5.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/_common/exception.py +33 -0
- unique_toolkit/_common/validators.py +8 -0
- unique_toolkit/content/service.py +7 -0
- unique_toolkit/evaluators/config.py +35 -0
- unique_toolkit/evaluators/context_relevancy/constants.py +32 -0
- unique_toolkit/evaluators/context_relevancy/prompts.py +31 -0
- unique_toolkit/evaluators/context_relevancy/service.py +53 -0
- unique_toolkit/evaluators/context_relevancy/utils.py +139 -0
- unique_toolkit/evaluators/exception.py +5 -0
- unique_toolkit/evaluators/hallucination/constants.py +41 -0
- unique_toolkit/evaluators/hallucination/prompts.py +79 -0
- unique_toolkit/evaluators/hallucination/service.py +58 -0
- unique_toolkit/evaluators/hallucination/utils.py +201 -0
- unique_toolkit/evaluators/output_parser.py +30 -0
- unique_toolkit/evaluators/schemas.py +82 -0
- unique_toolkit/language_model/infos.py +50 -0
- unique_toolkit/language_model/schemas.py +13 -2
- unique_toolkit/language_model/service.py +2 -2
- {unique_toolkit-0.5.24.dist-info → unique_toolkit-0.5.28.dist-info}/METADATA +16 -2
- {unique_toolkit-0.5.24.dist-info → unique_toolkit-0.5.28.dist-info}/RECORD +22 -8
- {unique_toolkit-0.5.24.dist-info → unique_toolkit-0.5.28.dist-info}/LICENSE +0 -0
- {unique_toolkit-0.5.24.dist-info → unique_toolkit-0.5.28.dist-info}/WHEEL +0 -0
unique_toolkit/_common/exception.py
@@ -0,0 +1,33 @@
+from typing import Optional
+
+
+class CommonException(Exception):
+    def __init__(
+        self,
+        user_message: str,
+        error_message: str,
+        exception: Optional[Exception] = None,
+    ):
+        super().__init__(error_message)
+        self._user_message = user_message
+        self._error_message = error_message
+        self._exception = exception
+
+    @property
+    def user_message(self):
+        return self._user_message
+
+    @property
+    def error_message(self):
+        return self._error_message
+
+    @property
+    def name(self):
+        return self.__class__.__name__
+
+    @property
+    def exception(self):
+        return self._exception
+
+    def __str__(self):
+        return self._error_message
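A minimal usage sketch of the new exception type, showing how the user-facing and internal messages stay separate (the surrounding handler code is illustrative, not part of the package):

from unique_toolkit._common.exception import CommonException

try:
    raise CommonException(
        user_message="Something went wrong, please retry.",
        error_message="Upstream call failed with status 500",
    )
except CommonException as e:
    print(e.name)          # "CommonException"
    print(e.user_message)  # safe to surface to the end user
    print(str(e))          # the internal error_message, e.g. for logs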
unique_toolkit/content/service.py
@@ -204,6 +204,13 @@ class ContentService(BaseService):
 
         return self._map_contents(contents)
 
+    def search_content_on_chat(
+        self,
+    ) -> list[Content]:
+        where = {"ownerId": {"equals": self.event.payload.chat_id}}
+
+        return self.search_contents(where)
+
     @staticmethod
     def _map_content_chunk(content_chunk: dict):
         return ContentChunk(
unique_toolkit/evaluators/config.py
@@ -0,0 +1,35 @@
+from humps import camelize
+from pydantic import BaseModel, ConfigDict, field_validator
+
+from unique_toolkit._common.validators import validate_and_init_language_model
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricName,
+)
+from unique_toolkit.language_model.infos import (
+    LanguageModel,
+    LanguageModelName,
+)
+
+model_config = ConfigDict(
+    alias_generator=camelize,
+    populate_by_name=True,
+    arbitrary_types_allowed=True,
+    validate_default=True,
+    json_encoders={LanguageModel: lambda v: v.display_name},
+)
+
+
+class EvaluationMetricConfig(BaseModel):
+    model_config = model_config
+
+    enabled: bool = False
+    name: EvaluationMetricName
+    language_model: LanguageModel = LanguageModel(
+        LanguageModelName.AZURE_GPT_35_TURBO_0613
+    )
+    custom_prompts: dict[str, str] = {}
+    score_to_emoji: dict[str, str] = {}
+
+    @field_validator("language_model", mode="before")
+    def validate_language_model(cls, value: LanguageModelName | LanguageModel):
+        return validate_and_init_language_model(value)
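Because the `language_model` validator runs in `before` mode and is typed to accept `LanguageModelName | LanguageModel`, a config can presumably be built from a bare model name; a hedged sketch:

from unique_toolkit.evaluators.config import EvaluationMetricConfig
from unique_toolkit.evaluators.schemas import EvaluationMetricName
from unique_toolkit.language_model.infos import LanguageModelName

config = EvaluationMetricConfig(
    enabled=True,
    name=EvaluationMetricName.HALLUCINATION,
    # assumption: validate_and_init_language_model wraps the name into a LanguageModel
    language_model=LanguageModelName.AZURE_GPT_4_0613,
)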
unique_toolkit/evaluators/context_relevancy/constants.py
@@ -0,0 +1,32 @@
+from unique_toolkit.evaluators.config import EvaluationMetricConfig
+from unique_toolkit.evaluators.context_relevancy.prompts import (
+    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+    CONTEXT_RELEVANCY_METRIC_USER_MSG,
+)
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricName,
+)
+from unique_toolkit.language_model.infos import LanguageModel
+from unique_toolkit.language_model.service import LanguageModelName
+
+SYSTEM_MSG_KEY = "systemPrompt"
+USER_MSG_KEY = "userPrompt"
+
+# Required input fields for context relevancy evaluation
+context_relevancy_required_input_fields = [
+    "input_text",
+    "output_text",
+    "context_texts",
+]
+
+
+default_config = EvaluationMetricConfig(
+    enabled=False,
+    name=EvaluationMetricName.CONTEXT_RELEVANCY,
+    language_model=LanguageModel(LanguageModelName.AZURE_GPT_35_TURBO_0613),
+    score_to_emoji={"LOW": "🟢", "MEDIUM": "🟡", "HIGH": "🔴"},
+    custom_prompts={
+        SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+        USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
+    },
+)
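The default config can be swapped for one with custom prompts under the same keys; a sketch, assuming the placeholders must match the keyword names that `Template.substitute` receives in `utils.py` (`input_text`, `contexts_text`):

from unique_toolkit.evaluators.config import EvaluationMetricConfig
from unique_toolkit.evaluators.context_relevancy.constants import (
    SYSTEM_MSG_KEY,
    USER_MSG_KEY,
    default_config,
)
from unique_toolkit.evaluators.schemas import EvaluationMetricName

custom_config = EvaluationMetricConfig(
    enabled=True,
    name=EvaluationMetricName.CONTEXT_RELEVANCY,
    language_model=default_config.language_model,
    custom_prompts={
        SYSTEM_MSG_KEY: "Rate how relevant the contexts are to the input. Answer as JSON with 'reason' and 'value'.",
        # $input_text and $contexts_text are the names substituted in utils.py
        USER_MSG_KEY: "Input:\n$input_text\n\nContexts:\n$contexts_text\n\nAnswer as JSON:",
    },
)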
unique_toolkit/evaluators/context_relevancy/prompts.py
@@ -0,0 +1,31 @@
+CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG = """
+You will receive an input and a set of contexts.
+Your task is to evaluate how relevant the contexts are to the input text.
+
+Use the following rating scale to generate a score:
+[low] - The contexts are not relevant to the input.
+[medium] - The contexts are somewhat relevant to the input.
+[high] - The contexts are highly relevant to the input.
+
+Your answer must be in JSON format:
+{
+"reason": Your explanation of your judgement of the evaluation,
+"value": decision, must be one of the following ["low", "medium", "high"]
+}
+"""
+
+CONTEXT_RELEVANCY_METRIC_USER_MSG = """
+Here is the data:
+
+Input:
+'''
+$input_text
+'''
+
+Contexts:
+'''
+$context_texts
+'''
+
+Answer as JSON:
+"""
unique_toolkit/evaluators/context_relevancy/service.py
@@ -0,0 +1,53 @@
+from logging import Logger
+
+from unique_toolkit.app.schemas import Event
+from unique_toolkit.evaluators.config import EvaluationMetricConfig
+from unique_toolkit.evaluators.context_relevancy.constants import default_config
+from unique_toolkit.evaluators.context_relevancy.utils import (
+    check_context_relevancy_async,
+)
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricResult,
+)
+
+
+class ContextRelevancyEvaluator:
+    def __init__(
+        self,
+        event: Event,
+        logger: Logger,
+    ):
+        self.event = event
+        self.logger = logger
+
+    async def run(
+        self,
+        input: EvaluationMetricInput,
+        config: EvaluationMetricConfig = default_config,
+    ) -> EvaluationMetricResult | None:
+        """
+        Analyzes the level of relevancy of a context by comparing
+        it with the input text.
+
+        Args:
+            input (EvaluationMetricInput): The input for the metric.
+            config (EvaluationMetricConfig): The configuration for the metric.
+
+        Returns:
+            EvaluationMetricResult | None: The result of the evaluation, indicating the level of context relevancy.
+                Returns None if the metric is not enabled.
+
+        Raises:
+            EvaluatorException: If required fields are missing or an error occurs during evaluation.
+        """
+        if config.enabled is False:
+            self.logger.info("Context relevancy metric is not enabled.")
+            return None
+
+        return await check_context_relevancy_async(
+            company_id=self.event.company_id,
+            input=input,
+            config=config,
+            logger=self.logger,
+        )
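A hedged end-to-end sketch of calling the evaluator (the `Event` comes from the incoming chat payload; the texts are placeholders, and `default_config` ships with `enabled=False`, so a copy with `enabled=True` is used):

import logging

from unique_toolkit.app.schemas import Event
from unique_toolkit.evaluators.context_relevancy.constants import default_config
from unique_toolkit.evaluators.context_relevancy.service import ContextRelevancyEvaluator
from unique_toolkit.evaluators.schemas import EvaluationMetricInput


async def evaluate_relevancy(event: Event) -> None:
    evaluator = ContextRelevancyEvaluator(event=event, logger=logging.getLogger(__name__))
    metric_input = EvaluationMetricInput(
        input_text="What is the notice period?",
        output_text="The notice period is three months.",
        context_texts=["Section 4: The notice period is three months."],
    )
    config = default_config.model_copy(update={"enabled": True})
    result = await evaluator.run(input=metric_input, config=config)
    if result:
        print(result.value, result.reason)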
unique_toolkit/evaluators/context_relevancy/utils.py
@@ -0,0 +1,139 @@
+import logging
+from string import Template
+
+from unique_toolkit.evaluators.config import (
+    EvaluationMetricConfig,
+)
+from unique_toolkit.evaluators.context_relevancy.constants import (
+    SYSTEM_MSG_KEY,
+    USER_MSG_KEY,
+    context_relevancy_required_input_fields,
+)
+from unique_toolkit.evaluators.context_relevancy.prompts import (
+    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+    CONTEXT_RELEVANCY_METRIC_USER_MSG,
+)
+from unique_toolkit.evaluators.exception import EvaluatorException
+from unique_toolkit.evaluators.output_parser import (
+    parse_eval_metric_result,
+)
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+from unique_toolkit.language_model.schemas import (
+    LanguageModelMessages,
+    LanguageModelSystemMessage,
+    LanguageModelUserMessage,
+)
+from unique_toolkit.language_model.service import LanguageModelService
+
+logger = logging.getLogger(__name__)
+
+
+async def check_context_relevancy_async(
+    company_id: str,
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+    logger: logging.Logger = logger,
+) -> EvaluationMetricResult | None:
+    """
+    Analyzes the relevancy of the context provided for the given input and output.
+    The analysis classifies the context relevancy level as:
+    - low
+    - medium
+    - high
+
+    This method performs the following steps:
+    1. Logs the start of the analysis using the provided `logger`.
+    2. Validates the required fields in the `input` data.
+    3. Retrieves the messages using the `_get_msgs` method.
+    4. Calls `LanguageModelService.complete_async_util` to get a completion result.
+    5. Parses and returns the evaluation metric result based on the content of the completion result.
+
+    Args:
+        company_id (str): The company ID for the analysis.
+        input (EvaluationMetricInput): The input data used for evaluation, including the generated output and reference information.
+        config (EvaluationMetricConfig): Configuration settings for the evaluation.
+        logger (Optional[logging.Logger], optional): The logger used for logging information and errors. Defaults to the logger for the current module.
+
+    Returns:
+        EvaluationMetricResult | None: The result of the evaluation, indicating the level of context relevancy. Returns `None` if an error occurs.
+
+    Raises:
+        EvaluatorException: If required fields are missing or an error occurs during the evaluation.
+    """
+    model_name = config.language_model.name
+    logger.info(f"Analyzing context relevancy with {model_name}.")
+
+    input.validate_required_fields(context_relevancy_required_input_fields)
+
+    if input.context_texts and len(input.context_texts) == 0:
+        error_message = "No context texts provided."
+        raise EvaluatorException(
+            user_message=error_message,
+            error_message=error_message,
+        )
+
+    try:
+        msgs = _get_msgs(input, config)
+        result = await LanguageModelService.complete_async_util(
+            company_id=company_id, messages=msgs, model_name=model_name
+        )
+        result_content = result.choices[0].message.content
+        if not result_content:
+            error_message = "Context relevancy evaluation did not return a result."
+            raise EvaluatorException(
+                error_message=error_message,
+                user_message=error_message,
+            )
+        return parse_eval_metric_result(
+            result_content, EvaluationMetricName.CONTEXT_RELEVANCY
+        )
+    except Exception as e:
+        error_message = "Error occurred during context relevancy metric analysis"
+        raise EvaluatorException(
+            error_message=f"{error_message}: {e}",
+            user_message=error_message,
+            exception=e,
+        )
+
+
+def _get_msgs(
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+):
+    """
+    Composes the messages for context relevancy analysis based on the provided input and configuration.
+
+    Args:
+        input (EvaluationMetricInput): The input data that includes context texts for the analysis.
+        config (EvaluationMetricConfig): The configuration settings for composing messages.
+
+    Returns:
+        LanguageModelMessages: The composed messages as per the provided input and configuration.
+    """
+    system_msg_content = _get_system_prompt(config)
+    system_msg = LanguageModelSystemMessage(content=system_msg_content)
+
+    user_msg_templ = Template(_get_user_prompt(config))
+    user_msg_content = user_msg_templ.substitute(
+        input_text=input.input_text, contexts_text=input.get_joined_context_texts()
+    )
+    user_msg = LanguageModelUserMessage(content=user_msg_content)
+    return LanguageModelMessages([system_msg, user_msg])
+
+
+def _get_system_prompt(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        SYSTEM_MSG_KEY,
+        CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+    )
+
+
+def _get_user_prompt(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        USER_MSG_KEY,
+        CONTEXT_RELEVANCY_METRIC_USER_MSG,
+    )
unique_toolkit/evaluators/hallucination/constants.py
@@ -0,0 +1,41 @@
+from unique_toolkit.evaluators.config import EvaluationMetricConfig
+from unique_toolkit.evaluators.hallucination.prompts import (
+    HALLUCINATION_METRIC_SYSTEM_MSG,
+    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+    HALLUCINATION_METRIC_USER_MSG,
+    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+)
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricInputFieldName,
+    EvaluationMetricName,
+)
+from unique_toolkit.language_model.infos import (
+    LanguageModel,
+    LanguageModelName,
+)
+
+SYSTEM_MSG_KEY = "systemPrompt"
+USER_MSG_KEY = "userPrompt"
+SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
+USER_MSG_DEFAULT_KEY = "userPromptDefault"
+
+
+hallucination_metric_default_config = EvaluationMetricConfig(
+    enabled=False,
+    name=EvaluationMetricName.HALLUCINATION,
+    language_model=LanguageModel(LanguageModelName.AZURE_GPT_4_0613),
+    score_to_emoji={"LOW": "🟢", "MEDIUM": "🟡", "HIGH": "🔴"},
+    custom_prompts={
+        SYSTEM_MSG_KEY: HALLUCINATION_METRIC_SYSTEM_MSG,
+        USER_MSG_KEY: HALLUCINATION_METRIC_USER_MSG,
+        SYSTEM_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+        USER_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+    },
+)
+
+hallucination_required_input_fields = [
+    EvaluationMetricInputFieldName.INPUT_TEXT,
+    EvaluationMetricInputFieldName.CONTEXT_TEXTS,
+    EvaluationMetricInputFieldName.HISTORY_MESSAGES,
+    EvaluationMetricInputFieldName.OUTPUT_TEXT,
+]
unique_toolkit/evaluators/hallucination/prompts.py
@@ -0,0 +1,79 @@
+HALLUCINATION_METRIC_SYSTEM_MSG = """
+You will receive a question, references, a conversation between a user and an agent, and an output.
+The output is the answer to the question.
+Your task is to evaluate if the output is fully supported by the information provided in the references and conversation, and provide explanations on your judgement in 2 sentences.
+
+Use the following entailment scale to generate a score:
+[low] - All information in output is supported by the references/conversation, or extractions from the references/conversation.
+[medium] - The output is supported by the references/conversation to some extent, but there is at least some information in the output that is not discussed in the references/conversation. For example, if an instruction asks about two concepts and the references/conversation only discusses either of them, it should be considered a [medium] hallucination level.
+[high] - The output contains information that is not part of the references/conversation, is unrelated to the references/conversation, or contradicts the references/conversation.
+
+Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check whether the output is supported by the references/conversation, and not whether the output is correct or not. Also do not evaluate if the references/conversation contain further information that is not part of the output but could be relevant to the qestion.
+
+Your answer must be in JSON format:
+{
+"reason": Your explanation of your judgement of the evaluation,
+"value": decision, must be one of the following: ["high", "medium", "low"]
+}
+"""
+
+HALLUCINATION_METRIC_USER_MSG = """
+Here is the data:
+
+Input:
+'''
+$input_text
+'''
+
+References:
+'''
+$contexts_text
+'''
+
+Conversation:
+'''
+$history_messages_text
+'''
+
+Output:
+'''
+$output_text
+'''
+
+Answer as JSON:
+"""
+
+HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT = """
+You will receive a question and an output.
+The output is the answer to the question.
+The situation is that no references could be found to answer the question. Your task is to evaluate if the output contains any information to answer the question,
+and provide a short explanations of your reasoning in 2 sentences. Also mention in your explanation that no references were provided to answer the question.
+
+Use the following entailment scale to generate a score:
+[low] - The output does not contain any information to answer the question.
+[medium] - The output contains some information to answer the question, but does not answer the question entirely.
+[high] - The output answers the question.
+
+It is not considered an answer when the output relates to the questions subject. Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check that the output does not answer the question, and not whether the output is correct or not.
+Your answer must be in JSON format:
+{
+"reason": Your explanation of your reasoning of the evaluation,
+"value": decision, must be one of the following: ["low", "medium", "high"]
+}
+"""
+
+HALLUCINATION_METRIC_USER_MSG_DEFAULT = """
+Here is the data:
+
+Input:
+'''
+$input_text
+'''
+
+Output:
+'''
+$output_text
+'''
+
+Answer as JSON:
+"""
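These prompts are plain `string.Template` strings; the substitution performed by the hallucination utils further below fills exactly these placeholders. A sketch with placeholder values:

from string import Template

from unique_toolkit.evaluators.hallucination.prompts import HALLUCINATION_METRIC_USER_MSG

user_msg = Template(HALLUCINATION_METRIC_USER_MSG).substitute(
    input_text="What is the notice period?",
    contexts_text="<reference-0>Section 4: three months.</reference-0>",
    history_messages_text="<No conversation texts provided>",
    output_text="The notice period is three months.",
)
print(user_msg)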
unique_toolkit/evaluators/hallucination/service.py
@@ -0,0 +1,58 @@
+import logging
+
+from unique_toolkit.app.schemas import Event
+from unique_toolkit.evaluators.config import (
+    EvaluationMetricConfig,
+)
+from unique_toolkit.evaluators.hallucination.constants import (
+    hallucination_metric_default_config,
+)
+from unique_toolkit.evaluators.hallucination.utils import check_hallucination_async
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricResult,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class HallucinationEvaluator:
+    def __init__(self, event: Event, logger: logging.Logger = logger):
+        self.event = event
+        self.logger = logger
+
+    async def run(
+        self,
+        input: EvaluationMetricInput,
+        config: EvaluationMetricConfig = hallucination_metric_default_config,
+    ) -> EvaluationMetricResult | None:
+        """
+        Analyzes the level of hallucination in the generated output by comparing it with the input
+        and the provided contexts or history. The analysis classifies the hallucination level as:
+        - low
+        - medium
+        - high
+
+        If no contexts or history are referenced in the generated output, the method verifies
+        that the output does not contain any relevant information to answer the question.
+
+        This method calls `check_hallucination_async` to perform the actual analysis. The `check_hallucination_async`
+        function handles the evaluation using the company ID from the event, the provided input, and the configuration.
+
+        Args:
+            input (EvaluationMetricInput): The input data used for evaluation, including the generated output and reference information.
+            config (EvaluationMetricConfig, optional): Configuration settings for the evaluation. Defaults to `hallucination_metric_default_config`.
+
+        Returns:
+            EvaluationMetricResult | None: The result of the evaluation, indicating the level of hallucination. Returns `None` if the analysis cannot be performed.
+
+        Raises:
+            EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
+        """
+        if config.enabled is False:
+            self.logger.info("Hallucination metric is not enabled.")
+            return None
+
+        return await check_hallucination_async(
+            company_id=self.event.company_id, input=input, config=config
+        )
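A hedged usage sketch, mirroring the context relevancy example above; the hallucination metric requires all four input fields, so `history_messages` is set to an empty list rather than left as `None`:

from unique_toolkit.app.schemas import Event
from unique_toolkit.evaluators.hallucination.constants import (
    hallucination_metric_default_config,
)
from unique_toolkit.evaluators.hallucination.service import HallucinationEvaluator
from unique_toolkit.evaluators.schemas import EvaluationMetricInput


async def evaluate_hallucination(event: Event) -> None:
    evaluator = HallucinationEvaluator(event=event)
    metric_input = EvaluationMetricInput(
        input_text="What is the notice period?",
        output_text="The notice period is three months.",
        context_texts=["Section 4: The notice period is three months."],
        history_messages=[],  # must not be None; validate_required_fields only checks for None
    )
    config = hallucination_metric_default_config.model_copy(update={"enabled": True})
    result = await evaluator.run(input=metric_input, config=config)
    if result:
        print(result.value, result.reason)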
unique_toolkit/evaluators/hallucination/utils.py
@@ -0,0 +1,201 @@
+import logging
+from string import Template
+
+from unique_toolkit.evaluators.config import (
+    EvaluationMetricConfig,
+)
+from unique_toolkit.evaluators.exception import EvaluatorException
+from unique_toolkit.evaluators.hallucination.constants import (
+    SYSTEM_MSG_DEFAULT_KEY,
+    SYSTEM_MSG_KEY,
+    USER_MSG_DEFAULT_KEY,
+    USER_MSG_KEY,
+    hallucination_required_input_fields,
+)
+from unique_toolkit.evaluators.output_parser import (
+    parse_eval_metric_result,
+)
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+from unique_toolkit.language_model.schemas import (
+    LanguageModelMessages,
+    LanguageModelSystemMessage,
+    LanguageModelUserMessage,
+)
+from unique_toolkit.language_model.service import LanguageModelService
+
+from .prompts import (
+    HALLUCINATION_METRIC_SYSTEM_MSG,
+    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+    HALLUCINATION_METRIC_USER_MSG,
+    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def check_hallucination_async(
+    company_id: str,
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+    logger: logging.Logger = logger,
+) -> EvaluationMetricResult | None:
+    """
+    Analyzes the level of hallucination in the generated output by comparing it with the provided input
+    and the contexts or history. The analysis classifies the hallucination level as:
+    - low
+    - medium
+    - high
+
+    If no contexts or history are referenced in the generated output, the method checks that the output
+    does not contain any relevant information to answer the question.
+
+    This method performs the following steps:
+    1. Checks if the hallucination metric is enabled using the provided `config`.
+    2. Logs the start of the analysis using the provided `logger`.
+    3. Validates the required fields in the `input` data.
+    4. Retrieves the messages using the `_get_msgs` method.
+    5. Calls `LanguageModelService.complete_async_util` to get a completion result.
+    6. Parses and returns the evaluation metric result based on the content of the completion result.
+
+    Args:
+        company_id (str): The company ID for the analysis.
+        input (EvaluationMetricInput): The input data used for evaluation, including the generated output and reference information.
+        config (EvaluationMetricConfig, optional): Configuration settings for the evaluation. Defaults to `hallucination_metric_default_config`.
+        logger (Optional[logging.Logger], optional): The logger used for logging information and errors. Defaults to the logger for the current module.
+
+    Returns:
+        EvaluationMetricResult | None: The result of the evaluation, indicating the level of hallucination. Returns `None` if the metric is not enabled or if an error occurs.
+
+    Raises:
+        EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
+    """
+    model_name = config.language_model.name
+    logger.info(f"Analyzing level of hallucination with {model_name}.")
+
+    input.validate_required_fields(hallucination_required_input_fields)
+
+    try:
+        msgs = _get_msgs(input, config, logger)
+        result = await LanguageModelService.complete_async_util(
+            company_id=company_id, messages=msgs, model_name=model_name
+        )
+        result_content = result.choices[0].message.content
+        if not result_content:
+            error_message = "Hallucination evaluation did not return a result."
+            raise EvaluatorException(
+                error_message=error_message,
+                user_message=error_message,
+            )
+        return parse_eval_metric_result(
+            result_content, EvaluationMetricName.HALLUCINATION
+        )
+    except Exception as e:
+        error_message = "Error occurred during hallucination metric analysis"
+        raise EvaluatorException(
+            error_message=f"{error_message}: {e}",
+            user_message=error_message,
+            exception=e,
+        )
+
+
+def _get_msgs(
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+    logger: logging.Logger,
+):
+    """
+    Composes the messages for hallucination analysis based on the provided input and configuration.
+
+    This method decides how to compose the messages based on the availability of context texts and history
+    message texts in the `input`
+
+    Args:
+        input (EvaluationMetricInput): The input data that includes context texts and history message texts
+            for the analysis.
+        config (EvaluationMetricConfig): The configuration settings for composing messages.
+        logger (Optional[logging.Logger], optional): The logger used for logging debug information.
+            Defaults to the logger for the current module.
+
+    Returns:
+        The composed messages as per the provided input and configuration. The exact type and structure
+        depend on the implementation of the `compose_msgs` and `compose_msgs_default` methods.
+
+    """
+    if input.context_texts or input.history_messages:
+        logger.debug("Using context / history for hallucination evaluation.")
+        return _compose_msgs(input, config)
+    else:
+        logger.debug("No contexts and history provided for hallucination evaluation.")
+        return _compose_msgs_default(input, config)
+
+
+def _compose_msgs(
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+):
+    """
+    Composes the hallucination analysis messages.
+    """
+    system_msg_content = _get_system_prompt_with_contexts(config)
+    system_msg = LanguageModelSystemMessage(content=system_msg_content)
+
+    user_msg_templ = Template(_get_user_prompt_with_contexts(config))
+    user_msg_content = user_msg_templ.substitute(
+        input_text=input.input_text,
+        contexts_text=input.get_joined_context_texts(tag_name="reference"),
+        history_messages_text=input.get_joined_history_texts(tag_name="conversation"),
+        output_text=input.output_text,
+    )
+    user_msg = LanguageModelUserMessage(content=user_msg_content)
+    return LanguageModelMessages([system_msg, user_msg])
+
+
+def _compose_msgs_default(
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+):
+    """
+    Composes the hallucination analysis prompt without messages.
+    """
+    system_msg_content = _get_system_prompt_default(config)
+    system_msg = LanguageModelSystemMessage(content=system_msg_content)
+
+    user_msg_templ = Template(_get_user_prompt_default(config))
+    user_msg_content = user_msg_templ.substitute(
+        input_text=input.input_text,
+        output_text=input.output_text,
+    )
+    user_msg = LanguageModelUserMessage(content=user_msg_content)
+    return LanguageModelMessages([system_msg, user_msg])
+
+
+def _get_system_prompt_with_contexts(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        SYSTEM_MSG_KEY,
+        HALLUCINATION_METRIC_SYSTEM_MSG,
+    )
+
+
+def _get_user_prompt_with_contexts(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        USER_MSG_KEY,
+        HALLUCINATION_METRIC_USER_MSG,
+    )
+
+
+def _get_system_prompt_default(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        SYSTEM_MSG_DEFAULT_KEY,
+        HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+    )
+
+
+def _get_user_prompt_default(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        USER_MSG_DEFAULT_KEY,
+        HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+    )
unique_toolkit/evaluators/output_parser.py
@@ -0,0 +1,30 @@
+from unique_toolkit.evaluators.exception import EvaluatorException
+from unique_toolkit.evaluators.schemas import (
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+from unique_toolkit.language_model.utils import convert_string_to_json
+
+
+def parse_eval_metric_result(
+    result: str,
+    metric_name: EvaluationMetricName,
+):
+    """
+    Parses the evaluation metric result.
+    """
+
+    try:
+        parsed_result = convert_string_to_json(result)
+    except Exception as e:
+        error_message = "Error occurred during parsing the evaluation metric result"
+        raise EvaluatorException(
+            user_message=f"{error_message}.",
+            error_message=f"{error_message}: {str(e)}",
+        )
+
+    return EvaluationMetricResult(
+        name=metric_name,
+        value=parsed_result.get("value", "None"),
+        reason=parsed_result.get("reason", "None"),
+    )
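Assuming `convert_string_to_json` accepts a plain JSON string, the parser maps the model's answer onto an `EvaluationMetricResult`; a sketch:

from unique_toolkit.evaluators.output_parser import parse_eval_metric_result
from unique_toolkit.evaluators.schemas import EvaluationMetricName

raw = '{"reason": "All statements are supported by the references.", "value": "low"}'
result = parse_eval_metric_result(raw, EvaluationMetricName.HALLUCINATION)
print(result.value)   # "low"
print(result.reason)  # "All statements are supported by the references."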
unique_toolkit/evaluators/schemas.py
@@ -0,0 +1,82 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel
+
+from unique_toolkit.chat import ChatMessage
+from unique_toolkit.evaluators.exception import EvaluatorException
+
+
+class EvaluationMetricName(Enum):
+    HALLUCINATION = "hallucination"
+    CONTEXT_RELEVANCY = "relevancy"
+
+
+class EvaluationMetricInputFieldName(str, Enum):
+    INPUT_TEXT = "input_text"
+    CONTEXT_TEXTS = "context_texts"
+    HISTORY_MESSAGES = "history_messages"
+    OUTPUT_TEXT = "output_text"
+
+
+class EvaluationMetricInput(BaseModel):
+    """
+    Input for any metric evaluation. Depending on the metric, the input can be different.
+    """
+
+    input_text: Optional[str] = None
+    context_texts: Optional[list[str]] = None
+    history_messages: Optional[list[ChatMessage]] = None
+    output_text: Optional[str] = None
+
+    def get_joined_context_texts(self, tag_name: str = "reference") -> str:
+        """
+        Concatenates context_texts.
+        """
+        if not self.context_texts:
+            return f"<No {tag_name} texts provided>"
+
+        return "\n".join(
+            [
+                f"<{tag_name}-{index}>{text}</{tag_name}-{index}>"
+                for index, text in enumerate(self.context_texts)
+            ]
+        )
+
+    def get_history_message_text(self, chat_message: ChatMessage):
+        return f"{chat_message.role.value}: {chat_message.content}"
+
+    def get_history_message_texts(self):
+        if not self.history_messages:
+            return []
+        return [self.get_history_message_text(msg) for msg in self.history_messages]
+
+    def get_joined_history_texts(self, tag_name: str = "conversation") -> str:
+        """
+        Concatenates history message texts.
+        """
+        if not self.history_messages:
+            return f"<No {tag_name} texts provided>"
+
+        return "\n".join(self.get_history_message_texts())
+
+    def validate_required_fields(
+        self, required_fields: list[EvaluationMetricInputFieldName]
+    ):
+        """
+        Validates the input fields for the hallucination metric.
+        """
+        for field in required_fields:
+            value = getattr(self, field)
+            if value is None:
+                error_message = f"Missing required input field: {field}"
+                raise EvaluatorException(
+                    user_message=error_message,
+                    error_message=error_message,
+                )
+
+
+class EvaluationMetricResult(BaseModel):
+    name: EvaluationMetricName
+    value: str
+    reason: str
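The tagging helpers wrap each context chunk and fall back to a placeholder when nothing is provided; a small sketch of the produced strings:

from unique_toolkit.evaluators.schemas import EvaluationMetricInput

metric_input = EvaluationMetricInput(
    context_texts=["First snippet.", "Second snippet."],
)
print(metric_input.get_joined_context_texts(tag_name="reference"))
# <reference-0>First snippet.</reference-0>
# <reference-1>Second snippet.</reference-1>
print(metric_input.get_joined_history_texts())
# <No conversation texts provided>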
unique_toolkit/language_model/infos.py
@@ -20,6 +20,35 @@ class LanguageModelName(StrEnum):
     AZURE_GPT_4o_MINI_2024_0718 = "AZURE_GPT_4o_MINI_2024_0718"
 
 
+class EncoderName(StrEnum):
+    O200K_BASE = "o200k_base"
+    CL100K_BASE = "cl100k_base"
+
+
+def get_encoder_name(model_name: LanguageModelName) -> Optional[EncoderName]:
+    LMN = LanguageModelName
+    match model_name:
+        case (
+            LMN.AZURE_GPT_35_TURBO
+            | LMN.AZURE_GPT_35_TURBO_16K
+            | LMN.AZURE_GPT_35_TURBO_0613
+        ):
+            return EncoderName.CL100K_BASE
+        case (
+            LMN.AZURE_GPT_4_0613
+            | LMN.AZURE_GPT_4_TURBO_1106
+            | LMN.AZURE_GPT_4_VISION_PREVIEW
+            | LMN.AZURE_GPT_4_32K_0613
+            | LMN.AZURE_GPT_4_TURBO_2024_0409
+        ):
+            return EncoderName.CL100K_BASE
+        case LMN.AZURE_GPT_4o_2024_0513 | LMN.AZURE_GPT_4o_MINI_2024_0718:
+            return EncoderName.O200K_BASE
+        case _:
+            print(f"{model_name} is not supported. Please add encoder information.")
+            return None
+
+
 class LanguageModelProvider(StrEnum):
     AZURE = "AZURE"
     CUSTOM = "CUSTOM"
@@ -30,6 +59,7 @@ class LanguageModelInfo(BaseModel):
     version: str
     provider: LanguageModelProvider
 
+    encoder_name: Optional[EncoderName] = None
     token_limits: Optional[LanguageModelTokenLimits] = None
 
     info_cutoff_at: Optional[date] = None
@@ -53,6 +83,7 @@ class LanguageModel:
     - name
     - version
     - provider
+    - encoder_name
     - token_limits
     - info_cutoff_at
     - published_at
@@ -86,6 +117,13 @@ class LanguageModel:
         """
         return self._model_info.version
 
+    @property
+    def encoder_name(self) -> Optional[EncoderName]:
+        """
+        Returns the encoder_name used for the model.
+        """
+        return self._model_info.encoder_name
+
     @property
     def token_limit(self) -> Optional[int]:
         """
@@ -191,6 +229,7 @@ def create_language_model(
    provider: LanguageModelProvider,
    info_cutoff_at: date,
    published_at: date,
+    encoder_name: Optional[EncoderName] = None,
    token_limit: Optional[int] = None,
    token_limit_input: Optional[int] = None,
    token_limit_output: Optional[int] = None,
@@ -202,6 +241,7 @@ def create_language_model(
        name=name,
        version=version,
        provider=provider,
+        encoder_name=encoder_name,
        token_limits=LanguageModelTokenLimits(
            token_limit=token_limit,
            token_limit_input=token_limit_input,
@@ -229,6 +269,7 @@ AzureGpt35Turbo0613 = create_language_model(
    name=LanguageModelName.AZURE_GPT_35_TURBO_0613,
    provider=LanguageModelProvider.AZURE,
    version="0613",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_35_TURBO_0613),
    token_limit=8192,
    info_cutoff_at=date(2021, 9, 1),
    published_at=date(2023, 6, 13),
@@ -239,6 +280,7 @@ AzureGpt35Turbo = create_language_model(
    name=LanguageModelName.AZURE_GPT_35_TURBO,
    provider=LanguageModelProvider.AZURE,
    version="0301",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_35_TURBO),
    token_limit=4096,
    info_cutoff_at=date(2021, 9, 1),
    published_at=date(2023, 3, 1),
@@ -249,6 +291,7 @@ AzureGpt35Turbo16k = create_language_model(
    name=LanguageModelName.AZURE_GPT_35_TURBO_16K,
    provider=LanguageModelProvider.AZURE,
    version="0613",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_35_TURBO_16K),
    token_limit=16382,
    info_cutoff_at=date(2021, 9, 1),
    published_at=date(2023, 6, 13),
@@ -260,6 +303,7 @@ AzureGpt40613 = create_language_model(
    name=LanguageModelName.AZURE_GPT_4_0613,
    provider=LanguageModelProvider.AZURE,
    version="0613",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_4_0613),
    token_limit=8192,
    info_cutoff_at=date(2021, 9, 1),
    published_at=date(2023, 6, 13),
@@ -272,6 +316,7 @@ AzureGpt4Turbo1106 = create_language_model(
    name=LanguageModelName.AZURE_GPT_4_TURBO_1106,
    provider=LanguageModelProvider.AZURE,
    version="1106-preview",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_4_TURBO_1106),
    token_limit_input=128000,
    token_limit_output=4096,
    info_cutoff_at=date(2023, 4, 1),
@@ -283,6 +328,7 @@ AzureGpt4VisionPreview = create_language_model(
    name=LanguageModelName.AZURE_GPT_4_VISION_PREVIEW,
    provider=LanguageModelProvider.AZURE,
    version="vision-preview",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_4_VISION_PREVIEW),
    token_limit_input=128000,
    token_limit_output=4096,
    info_cutoff_at=date(2023, 4, 1),
@@ -293,6 +339,7 @@ AzureGpt432k0613 = create_language_model(
    name=LanguageModelName.AZURE_GPT_4_32K_0613,
    provider=LanguageModelProvider.AZURE,
    version="1106-preview",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_4_32K_0613),
    token_limit=32768,
    info_cutoff_at=date(2021, 9, 1),
    published_at=date(2023, 6, 13),
@@ -302,6 +349,7 @@ AzureGpt432k0613 = create_language_model(
 
 AzureGpt4Turbo20240409 = create_language_model(
     name=LanguageModelName.AZURE_GPT_4_TURBO_2024_0409,
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_4_TURBO_2024_0409),
     provider=LanguageModelProvider.AZURE,
     version="turbo-2024-04-09",
     token_limit_input=128000,
@@ -312,6 +360,7 @@ AzureGpt4Turbo20240409 = create_language_model(
 
 AzureGpt4o20240513 = create_language_model(
     name=LanguageModelName.AZURE_GPT_4o_2024_0513,
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_4o_2024_0513),
     provider=LanguageModelProvider.AZURE,
     version="2024-05-13",
     token_limit_input=128000,
@@ -324,6 +373,7 @@ AzureGpt4oMini20240718 = create_language_model(
    name=LanguageModelName.AZURE_GPT_4o_MINI_2024_0718,
    provider=LanguageModelProvider.AZURE,
    version="2024-07-18",
+    encoder_name=get_encoder_name(LanguageModelName.AZURE_GPT_4o_MINI_2024_0718),
    token_limit_input=128000,
    token_limit_output=16384,
    info_cutoff_at=date(2023, 10, 1),
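Since `tiktoken` is already a dependency of the package, the new `encoder_name` can presumably be fed straight into `tiktoken.get_encoding`; a sketch:

import tiktoken

from unique_toolkit.language_model.infos import (
    LanguageModel,
    LanguageModelName,
    get_encoder_name,
)

encoder_name = get_encoder_name(LanguageModelName.AZURE_GPT_4o_2024_0513)
if encoder_name is not None:
    encoding = tiktoken.get_encoding(encoder_name.value)  # "o200k_base"
    print(len(encoding.encode("How many tokens is this?")))

# the same information is now exposed as a property on LanguageModel
print(LanguageModel(LanguageModelName.AZURE_GPT_4o_2024_0513).encoder_name)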
unique_toolkit/language_model/schemas.py
@@ -3,7 +3,14 @@ from enum import StrEnum
 from typing import Any, Optional, Self
 
 from humps import camelize
-from pydantic import
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    RootModel,
+    field_validator,
+    model_validator,
+)
 
 # set config to convert camelCase to snake_case
 model_config = ConfigDict(
@@ -173,7 +180,11 @@ class LanguageModelToolParameters(BaseModel):
 
 
 class LanguageModelTool(BaseModel):
-    name: str
+    name: str = Field(
+        ...,
+        pattern=r"^[a-zA-Z_-]+$",
+        description="Name must adhere to the pattern ^[a-zA-Z_-]+$",
+    )
     description: str
     parameters: LanguageModelToolParameters
     returns: LanguageModelToolParameterProperty | LanguageModelToolParameters | None = (
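The new pattern on `LanguageModelTool.name` allows only letters, underscores and hyphens; a standalone check (plain `re`, not the toolkit API) of which names would pass:

import re

TOOL_NAME_PATTERN = r"^[a-zA-Z_-]+$"  # the pattern now enforced via Field(pattern=...)

for name in ["get_weather", "get-weather", "get weather", "getWeather2"]:
    print(name, "->", bool(re.fullmatch(TOOL_NAME_PATTERN, name)))
# get_weather -> True, get-weather -> True, get weather -> False, getWeather2 -> False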
unique_toolkit/language_model/service.py
@@ -196,6 +196,7 @@ class LanguageModelService(BaseService):
            The LanguageModelStreamResponse object once the stream has finished.
        """
        options = self._add_tools_to_options({}, tools)
+        options["temperature"] = temperature
        search_context = self._to_search_context(content_chunks)
        messages = messages.model_dump(exclude_none=True)
        model = (
@@ -217,7 +218,6 @@ class LanguageModelService(BaseService):
            # TODO change or extend types in unique_sdk
            model=model,
            timeout=timeout,
-            temperature=temperature,
            assistantId=self.event.payload.assistant_id,
            debugInfo=debug_info,
            options=options,  # type: ignore
@@ -257,6 +257,7 @@ class LanguageModelService(BaseService):
        """
 
        options = self._add_tools_to_options({}, tools)
+        options["temperature"] = temperature
        search_context = self._to_search_context(content_chunks)
        messages = messages.model_dump(exclude_none=True, exclude=["tool_calls"])
        model = (
@@ -277,7 +278,6 @@ class LanguageModelService(BaseService):
            searchContext=search_context,
            model=model,
            timeout=timeout,
-            temperature=temperature,
            assistantId=self.event.payload.assistant_id,
            debugInfo=debug_info,
            # TODO change or extend types in unique_sdk
{unique_toolkit-0.5.24.dist-info → unique_toolkit-0.5.28.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unique_toolkit
-Version: 0.5.24
+Version: 0.5.28
 Summary:
 License: Proprietary
 Author: Martin Fadler
@@ -17,7 +17,7 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: regex (>=2024.5.15,<2025.0.0)
 Requires-Dist: tiktoken (>=0.7.0,<0.8.0)
 Requires-Dist: typing-extensions (>=4.9.0,<5.0.0)
-Requires-Dist: unique-sdk (>=0.9.
+Requires-Dist: unique-sdk (>=0.9.8,<0.10.0)
 Description-Content-Type: text/markdown
 
 # Unique Toolkit
@@ -100,6 +100,20 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.5.28] - 2024-10-23
+- Correctly use `temperature` parameter in `LanguageModelService.complete`, `LanguageModelService.complete_async`, `LanguageModelService.stream_complete` and `LanguageModelService.stream_complete_async` methods
+
+## [0.5.27] - 2024-10-22
+- Add `encoder_name` to language model info
+- Verify tool name for `LanguageModelTool` to conform with frontend requirements
+- Add `search_content_on_chat` to `ContentService`
+
+## [0.5.26] - 2024-10-16
+- Bump `unique_sdk` version
+
+## [0.5.25] - 2024-09-26
+- Add `evaluators` for hallucination and context relevancy evaluation
+
 ## [0.5.24] - 2024-09-26
 - Add `originalText` to `_construct_message_modify_params` and `_construct_message_create_params`. This addition makes sure that the `originalText` on the database is populated with the `text`
 
{unique_toolkit-0.5.24.dist-info → unique_toolkit-0.5.28.dist-info}/RECORD
@@ -1,6 +1,8 @@
 unique_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unique_toolkit/_common/_base_service.py,sha256=S8H0rAebx7GsOldA7xInLp3aQJt9yEPDQdsGSFRJsGg,276
 unique_toolkit/_common/_time_utils.py,sha256=ztmTovTvr-3w71Ns2VwXC65OKUUh-sQlzbHdKTQWm-w,135
+unique_toolkit/_common/exception.py,sha256=caQIE1btsQnpKCHqL2cgWUSbHup06enQu_Pt7uGUTTE,727
+unique_toolkit/_common/validators.py,sha256=w5lzvRxl0sBTvv0CXLF9UwtJyKmmS2lez0KXaqapgBE,258
 unique_toolkit/app/__init__.py,sha256=sZyGrz74jBlAjv6OcHgcp6VtP6-AKKpaVYjakr1Xk60,735
 unique_toolkit/app/init_logging.py,sha256=Sh26SRxOj8i8dzobKhYha2lLrkrMTHfB1V4jR3h23gQ,678
 unique_toolkit/app/init_sdk.py,sha256=Nv4Now4pMfM0AgRhbtatLpm_39rKxn0WmRLwmPhRl-8,1285
@@ -15,18 +17,30 @@ unique_toolkit/chat/state.py,sha256=Cjgwv_2vhDFbV69xxsn7SefhaoIAEqLx3ferdVFCnOg,
 unique_toolkit/chat/utils.py,sha256=ihm-wQykBWhB4liR3LnwPVPt_qGW6ETq21Mw4HY0THE,854
 unique_toolkit/content/__init__.py,sha256=MSH2sxjQyKD2Sef92fzE5Dt9SihdzivB6yliSwJfTmQ,890
 unique_toolkit/content/schemas.py,sha256=zks_Pkki2VhxICJJgHZyc-LPmRuj5dLbw3pgcUT7SW8,2362
-unique_toolkit/content/service.py,sha256=
+unique_toolkit/content/service.py,sha256=ZGYWYTphXpcByXyMqr1VOVUHdmdnsR-XIS_YRX0Wyv4,14211
 unique_toolkit/content/utils.py,sha256=Lake671plRsqNvO3pN_rmyVcpwbdED_KQpLcCnc4lv4,6902
 unique_toolkit/embedding/__init__.py,sha256=dr8M9jvslQTxPpxgaGwzxY0FildiWf-DidN_cahPAWw,191
 unique_toolkit/embedding/schemas.py,sha256=1GvKCaSk4jixzVQ2PKq8yDqwGEVY_hWclYtoAr6CC2g,96
 unique_toolkit/embedding/service.py,sha256=Iiw-sbdkjuWlWMfLM9qyC4GNTJOotQAaVjkYvh5Su4Y,2370
 unique_toolkit/embedding/utils.py,sha256=v86lo__bCJbxZBQ3OcLu5SuwT6NbFfWlcq8iyk6BuzQ,279
+unique_toolkit/evaluators/config.py,sha256=JRSHJvIjioXDMgd9hodK10J-52j3LMgJFvG0Vy7ePa8,1056
+unique_toolkit/evaluators/context_relevancy/constants.py,sha256=YErC92sqsY31cmBUG3dFQw78mUjbcpjMG7TLfYuLYmw,1051
+unique_toolkit/evaluators/context_relevancy/prompts.py,sha256=gTlWP7fDuxhrXhCYNCqXMbCey_DalZMdi5l-a6RHgk0,713
+unique_toolkit/evaluators/context_relevancy/service.py,sha256=9hzdMuF4A4T97-3X3zcXgrDISLn1bleZ6tTL1bHa9dQ,1722
+unique_toolkit/evaluators/context_relevancy/utils.py,sha256=DCFaoxZT_qDMKirjy3hTo1DIE7HpZ7-XR5P-rHuAoHQ,5137
+unique_toolkit/evaluators/exception.py,sha256=7lcVbCyoN4Md1chNJDFxpUYyWbVrcr9dcc3TxWykJTc,115
+unique_toolkit/evaluators/hallucination/constants.py,sha256=DEycXlxY9h01D0iF3aU5LIdPrDJ-5OkF0VdXDLn_tSs,1440
+unique_toolkit/evaluators/hallucination/prompts.py,sha256=9yCpO_WGLDvYfPWKL1VuRA-jt0P_-A-qvLUOmuv-Nks,3320
+unique_toolkit/evaluators/hallucination/service.py,sha256=k8qro5Lw4Ak58m4HYp3G4HPLIaexeFySIIVvW6fAdeA,2408
+unique_toolkit/evaluators/hallucination/utils.py,sha256=507BsX1mFTEne1-LdRCNMgBj-IXSFvBj1t3BPe1UkGs,7639
+unique_toolkit/evaluators/output_parser.py,sha256=eI72qkzK1dZyUvnfP2SOAQCGBj_-PwX5wy_aLPMsJMY,883
+unique_toolkit/evaluators/schemas.py,sha256=Jaue6Uhx75X1CyHKWj8sT3RE1JZXTqoLtfLt2xQNCX8,2507
 unique_toolkit/language_model/__init__.py,sha256=YuhyczGPj6w9xX-sOVUhmozvzIFxcckHFEkeMBecr5s,1784
-unique_toolkit/language_model/infos.py,sha256=
-unique_toolkit/language_model/schemas.py,sha256=
-unique_toolkit/language_model/service.py,sha256=
+unique_toolkit/language_model/infos.py,sha256=Oxkr9_6s8gFubxjox-iCm1GSs1RCAQQ5t8oh20izlC0,12002
+unique_toolkit/language_model/schemas.py,sha256=LO3QHsyFuJXG3HxXWFf44QV28JJzW8YW5TeIYhVzZTI,5035
+unique_toolkit/language_model/service.py,sha256=R8j2cr-lDbR96Vl5LVQIdtscS0gfscezKMXNMM2AZHM,13403
 unique_toolkit/language_model/utils.py,sha256=WBPj1XKkDgxy_-T8HCZvsfkkSzj_1w4UZzNmyvdbBLY,1081
-unique_toolkit-0.5.
-unique_toolkit-0.5.
-unique_toolkit-0.5.
-unique_toolkit-0.5.
+unique_toolkit-0.5.28.dist-info/LICENSE,sha256=GlN8wHNdh53xwOPg44URnwag6TEolCjoq3YD_KrWgss,193
+unique_toolkit-0.5.28.dist-info/METADATA,sha256=mvFUKA2gYyPWx19cex2Y6YIdoZ-WSwx0LeTGymenwVo,12521
+unique_toolkit-0.5.28.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+unique_toolkit-0.5.28.dist-info/RECORD,,
File without changes
|
File without changes
|