unique_toolkit 0.8.15__tar.gz → 0.8.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/CHANGELOG.md +3 -0
  2. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/PKG-INFO +4 -1
  3. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/pyproject.toml +1 -1
  4. unique_toolkit-0.8.16/unique_toolkit/evals/config.py +36 -0
  5. unique_toolkit-0.8.16/unique_toolkit/evals/context_relevancy/prompts.py +56 -0
  6. unique_toolkit-0.8.16/unique_toolkit/evals/context_relevancy/schema.py +88 -0
  7. unique_toolkit-0.8.16/unique_toolkit/evals/context_relevancy/service.py +241 -0
  8. unique_toolkit-0.8.16/unique_toolkit/evals/hallucination/constants.py +61 -0
  9. unique_toolkit-0.8.16/unique_toolkit/evals/hallucination/hallucination_evaluation.py +92 -0
  10. unique_toolkit-0.8.16/unique_toolkit/evals/hallucination/prompts.py +79 -0
  11. unique_toolkit-0.8.16/unique_toolkit/evals/hallucination/service.py +57 -0
  12. unique_toolkit-0.8.16/unique_toolkit/evals/hallucination/utils.py +213 -0
  13. unique_toolkit-0.8.16/unique_toolkit/evals/output_parser.py +48 -0
  14. unique_toolkit-0.8.16/unique_toolkit/evals/tests/test_context_relevancy_service.py +252 -0
  15. unique_toolkit-0.8.16/unique_toolkit/evals/tests/test_output_parser.py +80 -0
  16. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/history_manager/history_manager.py +3 -8
  17. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/schemas.py +8 -0
  18. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/LICENSE +0 -0
  19. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/README.md +0 -0
  20. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/__init__.py +0 -0
  21. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/_base_service.py +0 -0
  22. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/_time_utils.py +0 -0
  23. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/default_language_model.py +0 -0
  24. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/exception.py +0 -0
  25. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/token/image_token_counting.py +0 -0
  26. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/token/token_counting.py +0 -0
  27. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/validate_required_values.py +0 -0
  28. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/_common/validators.py +0 -0
  29. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/__init__.py +0 -0
  30. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/dev_util.py +0 -0
  31. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/init_logging.py +0 -0
  32. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/init_sdk.py +0 -0
  33. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/performance/async_tasks.py +0 -0
  34. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/performance/async_wrapper.py +0 -0
  35. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/schemas.py +0 -0
  36. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/unique_settings.py +0 -0
  37. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/app/verification.py +0 -0
  38. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/chat/__init__.py +0 -0
  39. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/chat/constants.py +0 -0
  40. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/chat/functions.py +0 -0
  41. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/chat/schemas.py +0 -0
  42. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/chat/service.py +0 -0
  43. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/chat/state.py +0 -0
  44. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/chat/utils.py +0 -0
  45. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/content/__init__.py +0 -0
  46. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/content/constants.py +0 -0
  47. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/content/functions.py +0 -0
  48. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/content/schemas.py +0 -0
  49. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/content/service.py +0 -0
  50. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/content/utils.py +0 -0
  51. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/embedding/__init__.py +0 -0
  52. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/embedding/constants.py +0 -0
  53. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/embedding/functions.py +0 -0
  54. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/embedding/schemas.py +0 -0
  55. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/embedding/service.py +0 -0
  56. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/embedding/utils.py +0 -0
  57. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evals/evaluation_manager.py +0 -0
  58. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evals/exception.py +0 -0
  59. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evals/schemas.py +0 -0
  60. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/__init__.py +0 -0
  61. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/config.py +0 -0
  62. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/constants.py +0 -0
  63. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/context_relevancy/constants.py +0 -0
  64. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/context_relevancy/prompts.py +0 -0
  65. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/context_relevancy/service.py +0 -0
  66. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/context_relevancy/utils.py +0 -0
  67. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/exception.py +0 -0
  68. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/hallucination/constants.py +0 -0
  69. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/hallucination/prompts.py +0 -0
  70. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/hallucination/service.py +0 -0
  71. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/hallucination/utils.py +0 -0
  72. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/output_parser.py +0 -0
  73. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/evaluators/schemas.py +0 -0
  74. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/framework_utilities/langchain/client.py +0 -0
  75. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/framework_utilities/langchain/history.py +0 -0
  76. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/framework_utilities/openai/client.py +0 -0
  77. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/framework_utilities/openai/message_builder.py +0 -0
  78. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/framework_utilities/utils.py +0 -0
  79. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/history_manager/history_construction_with_contents.py +0 -0
  80. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/history_manager/loop_token_reducer.py +0 -0
  81. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/history_manager/utils.py +0 -0
  82. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/__init__.py +0 -0
  83. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/builder.py +0 -0
  84. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/constants.py +0 -0
  85. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/functions.py +0 -0
  86. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/infos.py +0 -0
  87. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/prompt.py +0 -0
  88. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/reference.py +0 -0
  89. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/service.py +0 -0
  90. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/language_model/utils.py +0 -0
  91. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/postprocessor/postprocessor_manager.py +0 -0
  92. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/protocols/support.py +0 -0
  93. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/reference_manager/reference_manager.py +0 -0
  94. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/short_term_memory/__init__.py +0 -0
  95. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/short_term_memory/constants.py +0 -0
  96. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/short_term_memory/functions.py +0 -0
  97. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/short_term_memory/persistent_short_term_memory_manager.py +0 -0
  98. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/short_term_memory/schemas.py +0 -0
  99. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/short_term_memory/service.py +0 -0
  100. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/smart_rules/__init__.py +0 -0
  101. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/smart_rules/compile.py +0 -0
  102. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/thinking_manager/thinking_manager.py +0 -0
  103. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/agent_chunks_handler.py +0 -0
  104. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/config.py +0 -0
  105. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/factory.py +0 -0
  106. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/schemas.py +0 -0
  107. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/test/test_tool_progress_reporter.py +0 -0
  108. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/tool.py +0 -0
  109. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/tool_manager.py +0 -0
  110. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/tool_progress_reporter.py +0 -0
  111. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/utils/execution/execution.py +0 -0
  112. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/utils/source_handling/schema.py +0 -0
  113. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/utils/source_handling/source_formatting.py +0 -0
  114. {unique_toolkit-0.8.15 → unique_toolkit-0.8.16}/unique_toolkit/tools/utils/source_handling/tests/test_source_formatting.py +0 -0
@@ -5,6 +5,9 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.8.16] - 2025-08-19
9
+ - Moved Hallucination evaluator into toolkit
10
+
8
11
  ## [0.8.15] - 2025-08-19
9
12
  - Added history loading from database for History Manager
10
13
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unique_toolkit
3
- Version: 0.8.15
3
+ Version: 0.8.16
4
4
  Summary:
5
5
  License: Proprietary
6
6
  Author: Martin Fadler
@@ -114,6 +114,9 @@ All notable changes to this project will be documented in this file.
114
114
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
115
115
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
116
116
 
117
+ ## [0.8.16] - 2025-08-19
118
+ - Moved Hallucination evaluator into toolkit
119
+
117
120
  ## [0.8.15] - 2025-08-19
118
121
  - Added history loading from database for History Manager
119
122
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "unique_toolkit"
3
- version = "0.8.15"
3
+ version = "0.8.16"
4
4
  description = ""
5
5
  authors = [
6
6
  "Martin Fadler <martin.fadler@unique.ch>",
@@ -0,0 +1,36 @@
1
+ from typing import Any
2
+
3
+ from humps import camelize
4
+ from pydantic import BaseModel, ConfigDict, Field
5
+
6
+ from unique_toolkit._common.validators import LMI
7
+ from unique_toolkit.language_model.infos import LanguageModelInfo, LanguageModelName
8
+
9
+
10
+ from .schemas import (
11
+ EvaluationMetricName,
12
+ )
13
+
14
# Pydantic settings shared by the evaluation configs: field aliases are
# generated with `camelize`, fields may also be populated by their Python
# names, arbitrary types are allowed, and default values are validated.
model_config = ConfigDict(
    alias_generator=camelize,
    populate_by_name=True,
    arbitrary_types_allowed=True,
    validate_default=True,
)
20
+
21
+
22
class EvaluationMetricConfig(BaseModel):
    """Settings for a single evaluation metric.

    Only ``name`` is required; every other field has a default.
    """

    model_config = model_config

    # The metric is off unless explicitly enabled.
    enabled: bool = False
    # Which metric this configuration belongs to.
    name: EvaluationMetricName
    # Language model used to run the evaluation.
    language_model: LMI = LanguageModelInfo.from_name(
        LanguageModelName.AZURE_GPT_35_TURBO_0125,
    )
    additional_llm_options: dict[str, Any] = Field(
        default={},
        description="Additional options to pass to the language model.",
    )
    # Prompt overrides, keyed by prompt name.
    custom_prompts: dict[str, str] = {}
    # Lookup tables keyed by score; empty by default.
    score_to_label: dict[str, str] = {}
    score_to_title: dict[str, str] = {}
@@ -0,0 +1,56 @@
1
# System prompt for the plain-text (JSON-in-content) evaluation mode.
CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG = """
You will receive an input and a set of contexts.
Your task is to evaluate how relevant the contexts are to the input text.

Use the following rating scale to generate a score:
[low] - The contexts are not relevant to the input.
[medium] - The contexts are somewhat relevant to the input.
[high] - The contexts are highly relevant to the input.

Your answer must be in JSON format:
{
"reason": Your explanation of your judgement of the evaluation,
"value": decision, must be one of the following ["low", "medium", "high"]
}
"""

# System prompt for the structured-output mode; the output shape is defined
# by the data schema passed alongside the request rather than by this text.
CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT = """
You will receive an input and a set of contexts.
Your task is to evaluate how relevant the contexts are to the input text.
Further you should extract relevant facts from the contexts.

# Output Format
- Generate data according to the provided data schema.
- Ensure the output adheres to the format required by the pydantic object.
- All necessary fields should be populated as per the data schema guidelines.
"""

# User prompt template for the plain-text mode. `$input_text` and
# `$context_texts` are placeholders to be substituted before sending.
CONTEXT_RELEVANCY_METRIC_USER_MSG = """
Here is the data:

Input:
'''
$input_text
'''

Contexts:
'''
$context_texts
'''

Answer as JSON:
"""

# User prompt template for the structured-output mode: same placeholders as
# above, without the trailing "Answer as JSON:" instruction.
CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT = """
Here is the data:

Input:
'''
$input_text
'''

Contexts:
'''
$context_texts
'''
"""
@@ -0,0 +1,88 @@
1
+ from pydantic import BaseModel, Field, create_model
2
+ from pydantic.json_schema import SkipJsonSchema
3
+
4
+
5
+
6
+ from pydantic import BaseModel, ConfigDict
7
+
8
+ from unique_toolkit.tools.config import get_configuration_dict
9
+
10
+
11
class StructuredOutputModel(BaseModel):
    """Base class for structured-output schemas; extra fields are forbidden."""

    model_config = ConfigDict(extra="forbid")
13
+
14
+
15
+
16
class StructuredOutputConfig(BaseModel):
    """Options controlling structured output for the evaluation.

    The ``*_description`` fields hold the description texts that
    ``EvaluationSchemaStructuredOutput.get_with_descriptions`` attaches to
    the corresponding schema fields.
    """

    model_config = get_configuration_dict()

    enabled: bool = Field(
        default=False,
        description="Whether to use structured output for the evaluation.",
    )
    extract_fact_list: bool = Field(
        default=False,
        description="Whether to extract a list of relevant facts from context chunks with structured output.",
    )

    # Description texts for the individual schema fields.
    reason_description: str = Field(
        default="A brief explanation justifying your evaluation decision.",
        description="The description of the reason field for structured output.",
    )
    value_description: str = Field(
        default="Assessment of how relevant the facts are to the query. Must be one of: ['low', 'medium', 'high'].",
        description="The description of the value field for structured output.",
    )
    fact_description: str = Field(
        default="A fact is an information that is directly answers the user's query. Make sure to emphasize the important information from the fact with bold text.",
        description="The description of the fact field for structured output.",
    )
    fact_list_description: str = Field(
        default="A list of relevant facts extracted from the source that supports or answers the user's query.",
        description="The description of the fact list field for structured output.",
    )
44
+
45
+
46
class Fact(StructuredOutputModel):
    """A single extracted fact, stored as plain text."""

    fact: str
48
+
49
+
50
class EvaluationSchemaStructuredOutput(StructuredOutputModel):
    """Structured evaluation result: a reason, a value and optional facts."""

    reason: str
    value: str

    fact_list: list[Fact] = Field(default_factory=list[Fact])

    @classmethod
    def get_with_descriptions(cls, config: StructuredOutputConfig):
        """Build a subclass of this schema whose fields carry descriptions.

        Args:
            config: Supplies the description texts and decides, via
                ``extract_fact_list``, how the ``fact_list`` field is defined.

        Returns:
            A model class created with ``create_model`` using ``cls`` as base.
        """
        if not config.extract_fact_list:
            # No fact extraction: keep the field on the model with an empty
            # default, but wrap its type in SkipJsonSchema.
            fact_list_definition = (
                SkipJsonSchema[list[Fact]],
                Field(default_factory=list[Fact]),
            )
        else:
            # Fact extraction: derive a Fact model whose `fact` field carries
            # the configured description, and define the list without a default.
            described_fact = create_model(
                "Fact",
                fact=(str, Field(..., description=config.fact_description)),
                __base__=Fact,
            )
            fact_list_definition = (
                list[described_fact],
                Field(
                    description=config.fact_list_description,
                ),
            )

        field_definitions = {
            "reason": (str, Field(..., description=config.reason_description)),
            "value": (str, Field(..., description=config.value_description)),
            "fact_list": fact_list_definition,
        }
        return create_model(
            "EvaluationSchemaStructuredOutputWithDescription",
            __base__=cls,
            **field_definitions,
        )
+ )
@@ -0,0 +1,241 @@
1
+ import logging
2
+
3
+ from pydantic import BaseModel, ValidationError
4
+ from unique_toolkit.app.schemas import ChatEvent
5
+ from unique_toolkit.chat.service import ChatService
6
+ from unique_toolkit.language_model.infos import (
7
+ LanguageModelInfo,
8
+ LanguageModelName,
9
+ ModelCapabilities,
10
+ )
11
+ from unique_toolkit.language_model.prompt import Prompt
12
+ from unique_toolkit.language_model.schemas import (
13
+ LanguageModelMessages,
14
+ )
15
+ from unique_toolkit.language_model.service import (
16
+ LanguageModelService,
17
+ )
18
+ from unique_toolkit.evals.config import EvaluationMetricConfig
19
+ from unique_toolkit.evals.context_relevancy.schema import (
20
+ EvaluationSchemaStructuredOutput,
21
+ )
22
+ from unique_toolkit.evals.exception import EvaluatorException
23
+ from unique_toolkit.evals.output_parser import (
24
+ parse_eval_metric_result,
25
+ parse_eval_metric_result_structured_output,
26
+ )
27
+ from unique_toolkit.evals.schemas import (
28
+ EvaluationMetricInput,
29
+ EvaluationMetricInputFieldName,
30
+ EvaluationMetricName,
31
+ EvaluationMetricResult,
32
+ )
33
+
34
+
35
+ from .prompts import (
36
+ CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
37
+ CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
38
+ CONTEXT_RELEVANCY_METRIC_USER_MSG,
39
+ CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
40
+ )
41
+
42
# Keys under which the prompts are stored in `custom_prompts`.
SYSTEM_MSG_KEY = "systemPrompt"
USER_MSG_KEY = "userPrompt"

# Default configuration for the context relevancy metric: disabled, with the
# plain-text (non-structured) prompts preset.
default_config = EvaluationMetricConfig(
    enabled=False,
    name=EvaluationMetricName.CONTEXT_RELEVANCY,
    language_model=LanguageModelInfo.from_name(
        LanguageModelName.AZURE_GPT_4o_2024_1120
    ),
    custom_prompts={
        SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
        USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
    },
)

# Input fields checked by the evaluator before it runs.
relevancy_required_input_fields = [
    EvaluationMetricInputFieldName.INPUT_TEXT,
    EvaluationMetricInputFieldName.CONTEXT_TEXTS,
]
61
+
62
+
63
class ContextRelevancyEvaluator:
    """Rates how relevant a set of context texts is to an input text.

    The rating is produced by a language model, either through structured
    output (when a schema is supplied and the model supports it) or by
    parsing the model's plain-text answer.
    """

    def __init__(
        self,
        event: ChatEvent,
    ):
        self.chat_service = ChatService(event)
        self.language_model_service = LanguageModelService(event)
        self.logger = logging.getLogger(f"ContextRelevancyEvaluator.{__name__}")

    async def analyze(
        self,
        input: EvaluationMetricInput,
        config: EvaluationMetricConfig = default_config,
        structured_output_schema: type[BaseModel] | None = None,
    ) -> EvaluationMetricResult | None:
        """
        Analyzes the level of relevancy of a context by comparing
        it with the input text.

        Args:
            input (EvaluationMetricInput): The input for the metric.
            config (EvaluationMetricConfig): The configuration for the metric.
            structured_output_schema (type[BaseModel] | None): Schema to request
                structured output with. Only used when the configured language
                model lists the STRUCTURED_OUTPUT capability; otherwise the
                regular (plain-text) path is taken.

        Returns:
            EvaluationMetricResult | None: The result, or None when the metric
            is disabled in ``config``.

        Raises:
            EvaluatorException: If the context texts are empty or an error
                occurred during evaluation. Missing required fields are
                reported by ``input.validate_required_fields``.
        """
        if config.enabled is False:
            self.logger.info("Context relevancy metric is not enabled.")
            return None

        input.validate_required_fields(relevancy_required_input_fields)

        # Truthiness covers both an empty list and None, so a missing value
        # yields the intended EvaluatorException rather than a TypeError.
        if not input.context_texts:
            error_message = "No context texts provided."
            raise EvaluatorException(
                user_message=error_message,
                error_message=error_message,
            )

        try:
            # Handle structured output if enabled and supported by the model
            if (
                structured_output_schema
                and ModelCapabilities.STRUCTURED_OUTPUT
                in config.language_model.capabilities
            ):
                return await self._handle_structured_output(
                    input, config, structured_output_schema
                )

            # Handle regular output
            return await self._handle_regular_output(input, config)

        except Exception as e:
            # Every failure from either path (including EvaluatorException
            # raised by the handlers) is reported under one generic message;
            # the original exception is attached and chained as the cause.
            error_message = (
                "Unknown error occurred during context relevancy metric analysis"
            )
            raise EvaluatorException(
                error_message=f"{error_message}: {e}",
                user_message=error_message,
                exception=e,
            ) from e

    async def _handle_structured_output(
        self,
        input: EvaluationMetricInput,
        config: EvaluationMetricConfig,
        structured_output_schema: type[BaseModel],
    ) -> EvaluationMetricResult:
        """Handle the structured output case for context relevancy evaluation.

        Raises:
            EvaluatorException: If the parsed model answer does not validate
                against ``EvaluationSchemaStructuredOutput``.
        """
        self.logger.info("Using structured output for context relevancy evaluation.")
        msgs = self._compose_msgs(input, config, enable_structured_output=True)
        result = await self.language_model_service.complete_async(
            messages=msgs,
            model_name=config.language_model.name,
            structured_output_model=structured_output_schema,
            structured_output_enforce_schema=True,
            other_options=config.additional_llm_options,
        )

        try:
            result_content = EvaluationSchemaStructuredOutput.model_validate(
                result.choices[0].message.parsed
            )
        except ValidationError as e:
            error_message = "Error occurred during structured output validation of the context relevancy evaluation."
            raise EvaluatorException(
                error_message=error_message,
                user_message=error_message,
                exception=e,
            ) from e

        return parse_eval_metric_result_structured_output(
            result_content, EvaluationMetricName.CONTEXT_RELEVANCY
        )

    async def _handle_regular_output(
        self,
        input: EvaluationMetricInput,
        config: EvaluationMetricConfig,
    ) -> EvaluationMetricResult:
        """Handle the regular output case for context relevancy evaluation.

        Raises:
            EvaluatorException: If the model returns no non-empty string content.
        """
        msgs = self._compose_msgs(input, config, enable_structured_output=False)
        result = await self.language_model_service.complete_async(
            messages=msgs,
            model_name=config.language_model.name,
            other_options=config.additional_llm_options,
        )

        result_content = result.choices[0].message.content
        if not result_content or not isinstance(result_content, str):
            error_message = "Context relevancy evaluation did not return a result."
            raise EvaluatorException(
                error_message=error_message,
                user_message=error_message,
            )

        return parse_eval_metric_result(
            result_content, EvaluationMetricName.CONTEXT_RELEVANCY
        )

    def _compose_msgs(
        self,
        input: EvaluationMetricInput,
        config: EvaluationMetricConfig,
        enable_structured_output: bool,
    ) -> LanguageModelMessages:
        """
        Composes the system and user messages for the relevancy metric.
        """
        system_msg_content = self._get_system_prompt(config, enable_structured_output)
        system_msg = Prompt(system_msg_content).to_system_msg()

        user_msg = Prompt(
            self._get_user_prompt(config, enable_structured_output),
            input_text=input.input_text,
            context_texts=input.get_joined_context_texts(),
        ).to_user_msg()

        return LanguageModelMessages([system_msg, user_msg])

    def _get_system_prompt(
        self,
        config: EvaluationMetricConfig,
        enable_structured_output: bool,
    ) -> str:
        """Return the configured system prompt, or the built-in one for the mode.

        Uses ``dict.get`` rather than ``dict.setdefault`` so the lookup never
        writes the fallback into the caller's config; otherwise the prompt of
        the first mode used would be reused for later calls in the other mode.
        """
        if (
            enable_structured_output
            and ModelCapabilities.STRUCTURED_OUTPUT
            in config.language_model.capabilities
        ):
            fallback = CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT
        else:
            fallback = CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
        return config.custom_prompts.get(SYSTEM_MSG_KEY, fallback)

    def _get_user_prompt(
        self,
        config: EvaluationMetricConfig,
        enable_structured_output: bool,
    ) -> str:
        """Return the configured user prompt, or the built-in one for the mode.

        Read-only lookup; the caller's config is not modified.
        """
        if enable_structured_output:
            fallback = CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT
        else:
            fallback = CONTEXT_RELEVANCY_METRIC_USER_MSG
        return config.custom_prompts.get(USER_MSG_KEY, fallback)
@@ -0,0 +1,61 @@
1
+ from typing import Any
2
+
3
+ from pydantic import Field
4
+
5
+ from unique_toolkit._common.validators import LMI
6
+ from unique_toolkit.evals.config import EvaluationMetricConfig
7
+ from unique_toolkit.evals.hallucination.prompts import (
8
+ HALLUCINATION_METRIC_SYSTEM_MSG,
9
+ HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
10
+ HALLUCINATION_METRIC_USER_MSG,
11
+ HALLUCINATION_METRIC_USER_MSG_DEFAULT,
12
+ )
13
+ from unique_toolkit.evals.schemas import (
14
+ EvaluationMetricInputFieldName,
15
+ EvaluationMetricName,
16
+ )
17
+ from unique_toolkit.language_model.infos import LanguageModelInfo, LanguageModelName
18
+
19
+
20
# Keys under which the hallucination prompts are stored in `custom_prompts`.
SYSTEM_MSG_KEY = "systemPrompt"
USER_MSG_KEY = "userPrompt"
# Keys for the "*_DEFAULT" prompt variants.
SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
USER_MSG_DEFAULT_KEY = "userPromptDefault"
24
+
25
+
26
class HallucinationConfig(EvaluationMetricConfig):
    """Evaluation metric configuration preset for the hallucination metric.

    Overrides the base defaults with the hallucination metric name, the four
    hallucination prompts, and score-to-label / score-to-title lookup tables.
    """

    enabled: bool = False
    name: EvaluationMetricName = EvaluationMetricName.HALLUCINATION
    language_model: LMI = LanguageModelInfo.from_name(
        LanguageModelName.AZURE_GPT_35_TURBO_0125,
    )
    additional_llm_options: dict[str, Any] = Field(
        default={},
        description="Additional options to pass to the language model.",
    )
    # Regular prompts plus the "*_DEFAULT" variants.
    custom_prompts: dict = {
        SYSTEM_MSG_KEY: HALLUCINATION_METRIC_SYSTEM_MSG,
        USER_MSG_KEY: HALLUCINATION_METRIC_USER_MSG,
        SYSTEM_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
        USER_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_USER_MSG_DEFAULT,
    }
    # Upper-case score -> assessment label.
    score_to_label: dict = {
        "LOW": "GREEN",
        "MEDIUM": "YELLOW",
        "HIGH": "RED",
    }
    # Upper-case score -> title.
    score_to_title: dict = {
        "LOW": "No Hallucination Detected",
        "MEDIUM": "Hallucination Warning",
        "HIGH": "High Hallucination",
    }
52
+
53
+
54
# Module-level default configuration, built from HallucinationConfig's defaults.
hallucination_metric_default_config = HallucinationConfig()

# Input fields the hallucination metric lists as required.
hallucination_required_input_fields = [
    EvaluationMetricInputFieldName.INPUT_TEXT,
    EvaluationMetricInputFieldName.CONTEXT_TEXTS,
    EvaluationMetricInputFieldName.HISTORY_MESSAGES,
    EvaluationMetricInputFieldName.OUTPUT_TEXT,
]
@@ -0,0 +1,92 @@
1
+ from typing import Any
2
+
3
# NOTE: modules are imported from the top-level `unique_toolkit` package.
# The previous `unique_toolkit.unique_toolkit.*` paths do not match the
# package layout (e.g. `unique_toolkit/app/schemas.py`) and were inconsistent
# with the `unique_toolkit.language_model.schemas` import in this same file.
from unique_toolkit.app.schemas import ChatEvent
from unique_toolkit.chat.schemas import (
    ChatMessageAssessmentLabel,
    ChatMessageAssessmentStatus,
    ChatMessageAssessmentType,
)
from unique_toolkit.evals.config import EvaluationMetricConfig
from unique_toolkit.evals.evaluation_manager import Evaluation
from unique_toolkit.evals.hallucination.constants import (
    HallucinationConfig,
)
from unique_toolkit.evals.hallucination.utils import check_hallucination
from unique_toolkit.evals.schemas import (
    EvaluationAssessmentMessage,
    EvaluationMetricInput,
    EvaluationMetricName,
    EvaluationMetricResult,
)
from unique_toolkit.language_model.schemas import (
    LanguageModelStreamResponse,
)
from unique_toolkit.reference_manager.reference_manager import (
    ReferenceManager,
)
28
+
29
+
30
class HallucinationEvaluation(Evaluation):
    """Evaluation that runs the hallucination check on a loop response."""

    def __init__(
        self,
        config: HallucinationConfig,
        event: ChatEvent,
        reference_manager: ReferenceManager,
    ):
        """Store the config, the reference manager and the event data used later.

        Args:
            config: Hallucination metric configuration.
            event: Chat event; company id, user id and the user message text
                are read from it.
            reference_manager: Source of the chunks used as context texts.
        """
        self.config = config
        self._reference_manager = reference_manager
        self._company_id = event.company_id
        self._user_id = event.user_id
        self._user_message = event.payload.user_message.text
        super().__init__(EvaluationMetricName.HALLUCINATION)

    async def run(
        self, loop_response: LanguageModelStreamResponse
    ) -> EvaluationMetricResult:  # type: ignore
        """Run the hallucination check and flag the result as positive or not.

        The result counts as positive unless its upper-cased value maps to
        "RED" in ``config.score_to_label``; unmapped values are treated as "RED".
        """
        referenced_chunks = self._reference_manager.get_chunks()

        metric_input = EvaluationMetricInput(
            input_text=self._user_message,
            context_texts=[chunk.text for chunk in referenced_chunks],
            history_messages=[],  # TODO include loop_history messages
            output_text=loop_response.message.text,
        )
        evaluation_result: EvaluationMetricResult = await check_hallucination(
            company_id=self._company_id,
            input=metric_input,
            config=self.config,
        )

        mapped_label = self.config.score_to_label.get(
            evaluation_result.value.upper(), "RED"
        )
        evaluation_result.is_positive = mapped_label != "RED"
        return evaluation_result

    def get_assessment_type(self) -> ChatMessageAssessmentType:
        """Return the assessment type of this evaluation (HALLUCINATION)."""
        return ChatMessageAssessmentType.HALLUCINATION

    async def evaluation_metric_to_assessment(
        self, evaluation_result: EvaluationMetricResult
    ) -> EvaluationAssessmentMessage:
        """Convert a metric result into an assessment message.

        Title and label are looked up by the upper-cased result value. An
        unmapped value falls back to the raw value for the title and to the
        upper-cased value for the label.
        """
        score_key = evaluation_result.value.upper()

        title = self.config.score_to_title.get(score_key, evaluation_result.value)
        label = ChatMessageAssessmentLabel(
            self.config.score_to_label.get(score_key, score_key)
        )

        if evaluation_result.error:
            status = ChatMessageAssessmentStatus.ERROR
        else:
            status = ChatMessageAssessmentStatus.DONE

        return EvaluationAssessmentMessage(
            status=status,
            title=title,
            explanation=evaluation_result.reason,
            label=label,
            type=self.get_assessment_type(),
        )
@@ -0,0 +1,79 @@
1
# System prompt for judging whether an output is supported by the supplied
# references and conversation.
HALLUCINATION_METRIC_SYSTEM_MSG = """
You will receive a question, references, a conversation between a user and an agent, and an output.
The output is the answer to the question.
Your task is to evaluate if the output is fully supported by the information provided in the references and conversation, and provide explanations on your judgement in 2 sentences.

Use the following entailment scale to generate a score:
[low] - All information in output is supported by the references/conversation, or extractions from the references/conversation.
[medium] - The output is supported by the references/conversation to some extent, but there is at least some information in the output that is not discussed in the references/conversation. For example, if an instruction asks about two concepts and the references/conversation only discusses either of them, it should be considered a [medium] hallucination level.
[high] - The output contains information that is not part of the references/conversation, is unrelated to the references/conversation, or contradicts the references/conversation.

Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check whether the output is supported by the references/conversation, and not whether the output is correct or not. Also do not evaluate if the references/conversation contain further information that is not part of the output but could be relevant to the question. If the output mentions a plot or chart, ignore this information in your evaluation.

Your answer must be in JSON format:
{
"reason": Your explanation of your judgement of the evaluation,
"value": decision, must be one of the following: ["high", "medium", "low"]
}
"""

# User prompt template with placeholders for the input, the references, the
# conversation and the output.
HALLUCINATION_METRIC_USER_MSG = """
Here is the data:

Input:
'''
$input_text
'''

References:
'''
$contexts_text
'''

Conversation:
'''
$history_messages_text
'''

Output:
'''
$output_text
'''

Answer as JSON:
"""

# System prompt for the situation where no references could be found.
HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT = """
You will receive a question and an output.
The output is the answer to the question.
The situation is that no references could be found to answer the question. Your task is to evaluate if the output contains any information to answer the question,
and provide a short explanations of your reasoning in 2 sentences. Also mention in your explanation that no references were provided to answer the question.

Use the following entailment scale to generate a score:
[low] - The output does not contain any information to answer the question.
[medium] - The output contains some information to answer the question, but does not answer the question entirely.
[high] - The output answers the question.

It is not considered an answer when the output relates to the questions subject. Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check that the output does not answer the question, and not whether the output is correct or not.
Your answer must be in JSON format:
{
"reason": Your explanation of your reasoning of the evaluation,
"value": decision, must be one of the following: ["low", "medium", "high"]
}
"""

# User prompt template for the no-references situation: only the input and
# the output are included.
HALLUCINATION_METRIC_USER_MSG_DEFAULT = """
Here is the data:

Input:
'''
$input_text
'''

Output:
'''
$output_text
'''

Answer as JSON:
"""