unique_toolkit 0.7.7__py3-none-any.whl → 1.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unique_toolkit might be problematic.

Files changed (166)
  1. unique_toolkit/__init__.py +28 -1
  2. unique_toolkit/_common/api_calling/human_verification_manager.py +343 -0
  3. unique_toolkit/_common/base_model_type_attribute.py +303 -0
  4. unique_toolkit/_common/chunk_relevancy_sorter/config.py +49 -0
  5. unique_toolkit/_common/chunk_relevancy_sorter/exception.py +5 -0
  6. unique_toolkit/_common/chunk_relevancy_sorter/schemas.py +46 -0
  7. unique_toolkit/_common/chunk_relevancy_sorter/service.py +374 -0
  8. unique_toolkit/_common/chunk_relevancy_sorter/tests/test_service.py +275 -0
  9. unique_toolkit/_common/default_language_model.py +12 -0
  10. unique_toolkit/_common/docx_generator/__init__.py +7 -0
  11. unique_toolkit/_common/docx_generator/config.py +12 -0
  12. unique_toolkit/_common/docx_generator/schemas.py +80 -0
  13. unique_toolkit/_common/docx_generator/service.py +252 -0
  14. unique_toolkit/_common/docx_generator/template/Doc Template.docx +0 -0
  15. unique_toolkit/_common/endpoint_builder.py +305 -0
  16. unique_toolkit/_common/endpoint_requestor.py +430 -0
  17. unique_toolkit/_common/exception.py +24 -0
  18. unique_toolkit/_common/feature_flags/schema.py +9 -0
  19. unique_toolkit/_common/pydantic/rjsf_tags.py +936 -0
  20. unique_toolkit/_common/pydantic_helpers.py +154 -0
  21. unique_toolkit/_common/referencing.py +53 -0
  22. unique_toolkit/_common/string_utilities.py +140 -0
  23. unique_toolkit/_common/tests/test_referencing.py +521 -0
  24. unique_toolkit/_common/tests/test_string_utilities.py +506 -0
  25. unique_toolkit/_common/token/image_token_counting.py +67 -0
  26. unique_toolkit/_common/token/token_counting.py +204 -0
  27. unique_toolkit/_common/utils/__init__.py +1 -0
  28. unique_toolkit/_common/utils/files.py +43 -0
  29. unique_toolkit/_common/utils/structured_output/__init__.py +1 -0
  30. unique_toolkit/_common/utils/structured_output/schema.py +5 -0
  31. unique_toolkit/_common/utils/write_configuration.py +51 -0
  32. unique_toolkit/_common/validators.py +101 -4
  33. unique_toolkit/agentic/__init__.py +1 -0
  34. unique_toolkit/agentic/debug_info_manager/debug_info_manager.py +28 -0
  35. unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py +278 -0
  36. unique_toolkit/agentic/evaluation/config.py +36 -0
  37. unique_toolkit/{evaluators → agentic/evaluation}/context_relevancy/prompts.py +25 -0
  38. unique_toolkit/agentic/evaluation/context_relevancy/schema.py +80 -0
  39. unique_toolkit/agentic/evaluation/context_relevancy/service.py +273 -0
  40. unique_toolkit/agentic/evaluation/evaluation_manager.py +218 -0
  41. unique_toolkit/agentic/evaluation/hallucination/constants.py +61 -0
  42. unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py +111 -0
  43. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/prompts.py +1 -1
  44. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/service.py +16 -15
  45. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/utils.py +30 -20
  46. unique_toolkit/{evaluators → agentic/evaluation}/output_parser.py +20 -2
  47. unique_toolkit/{evaluators → agentic/evaluation}/schemas.py +27 -7
  48. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +253 -0
  49. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +87 -0
  50. unique_toolkit/agentic/history_manager/history_construction_with_contents.py +297 -0
  51. unique_toolkit/agentic/history_manager/history_manager.py +242 -0
  52. unique_toolkit/agentic/history_manager/loop_token_reducer.py +484 -0
  53. unique_toolkit/agentic/history_manager/utils.py +96 -0
  54. unique_toolkit/agentic/postprocessor/postprocessor_manager.py +212 -0
  55. unique_toolkit/agentic/reference_manager/reference_manager.py +103 -0
  56. unique_toolkit/agentic/responses_api/__init__.py +19 -0
  57. unique_toolkit/agentic/responses_api/postprocessors/code_display.py +63 -0
  58. unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +145 -0
  59. unique_toolkit/agentic/responses_api/stream_handler.py +15 -0
  60. unique_toolkit/agentic/short_term_memory_manager/persistent_short_term_memory_manager.py +141 -0
  61. unique_toolkit/agentic/thinking_manager/thinking_manager.py +103 -0
  62. unique_toolkit/agentic/tools/__init__.py +1 -0
  63. unique_toolkit/agentic/tools/a2a/__init__.py +36 -0
  64. unique_toolkit/agentic/tools/a2a/config.py +17 -0
  65. unique_toolkit/agentic/tools/a2a/evaluation/__init__.py +15 -0
  66. unique_toolkit/agentic/tools/a2a/evaluation/_utils.py +66 -0
  67. unique_toolkit/agentic/tools/a2a/evaluation/config.py +55 -0
  68. unique_toolkit/agentic/tools/a2a/evaluation/evaluator.py +260 -0
  69. unique_toolkit/agentic/tools/a2a/evaluation/summarization_user_message.j2 +9 -0
  70. unique_toolkit/agentic/tools/a2a/manager.py +55 -0
  71. unique_toolkit/agentic/tools/a2a/postprocessing/__init__.py +21 -0
  72. unique_toolkit/agentic/tools/a2a/postprocessing/_display_utils.py +185 -0
  73. unique_toolkit/agentic/tools/a2a/postprocessing/_ref_utils.py +73 -0
  74. unique_toolkit/agentic/tools/a2a/postprocessing/config.py +45 -0
  75. unique_toolkit/agentic/tools/a2a/postprocessing/display.py +180 -0
  76. unique_toolkit/agentic/tools/a2a/postprocessing/references.py +101 -0
  77. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display_utils.py +1335 -0
  78. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_ref_utils.py +603 -0
  79. unique_toolkit/agentic/tools/a2a/prompts.py +46 -0
  80. unique_toolkit/agentic/tools/a2a/response_watcher/__init__.py +6 -0
  81. unique_toolkit/agentic/tools/a2a/response_watcher/service.py +91 -0
  82. unique_toolkit/agentic/tools/a2a/tool/__init__.py +4 -0
  83. unique_toolkit/agentic/tools/a2a/tool/_memory.py +26 -0
  84. unique_toolkit/agentic/tools/a2a/tool/_schema.py +9 -0
  85. unique_toolkit/agentic/tools/a2a/tool/config.py +73 -0
  86. unique_toolkit/agentic/tools/a2a/tool/service.py +306 -0
  87. unique_toolkit/agentic/tools/agent_chunks_hanlder.py +65 -0
  88. unique_toolkit/agentic/tools/config.py +167 -0
  89. unique_toolkit/agentic/tools/factory.py +44 -0
  90. unique_toolkit/agentic/tools/mcp/__init__.py +4 -0
  91. unique_toolkit/agentic/tools/mcp/manager.py +71 -0
  92. unique_toolkit/agentic/tools/mcp/models.py +28 -0
  93. unique_toolkit/agentic/tools/mcp/tool_wrapper.py +234 -0
  94. unique_toolkit/agentic/tools/openai_builtin/__init__.py +11 -0
  95. unique_toolkit/agentic/tools/openai_builtin/base.py +30 -0
  96. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/__init__.py +8 -0
  97. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/config.py +57 -0
  98. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/service.py +230 -0
  99. unique_toolkit/agentic/tools/openai_builtin/manager.py +62 -0
  100. unique_toolkit/agentic/tools/schemas.py +141 -0
  101. unique_toolkit/agentic/tools/test/test_mcp_manager.py +536 -0
  102. unique_toolkit/agentic/tools/test/test_tool_progress_reporter.py +445 -0
  103. unique_toolkit/agentic/tools/tool.py +183 -0
  104. unique_toolkit/agentic/tools/tool_manager.py +523 -0
  105. unique_toolkit/agentic/tools/tool_progress_reporter.py +285 -0
  106. unique_toolkit/agentic/tools/utils/__init__.py +19 -0
  107. unique_toolkit/agentic/tools/utils/execution/__init__.py +1 -0
  108. unique_toolkit/agentic/tools/utils/execution/execution.py +286 -0
  109. unique_toolkit/agentic/tools/utils/source_handling/__init__.py +0 -0
  110. unique_toolkit/agentic/tools/utils/source_handling/schema.py +21 -0
  111. unique_toolkit/agentic/tools/utils/source_handling/source_formatting.py +207 -0
  112. unique_toolkit/agentic/tools/utils/source_handling/tests/test_source_formatting.py +216 -0
  113. unique_toolkit/app/__init__.py +6 -0
  114. unique_toolkit/app/dev_util.py +180 -0
  115. unique_toolkit/app/init_sdk.py +32 -1
  116. unique_toolkit/app/schemas.py +198 -31
  117. unique_toolkit/app/unique_settings.py +367 -0
  118. unique_toolkit/chat/__init__.py +8 -1
  119. unique_toolkit/chat/deprecated/service.py +232 -0
  120. unique_toolkit/chat/functions.py +642 -77
  121. unique_toolkit/chat/rendering.py +34 -0
  122. unique_toolkit/chat/responses_api.py +461 -0
  123. unique_toolkit/chat/schemas.py +133 -2
  124. unique_toolkit/chat/service.py +115 -767
  125. unique_toolkit/content/functions.py +153 -4
  126. unique_toolkit/content/schemas.py +122 -15
  127. unique_toolkit/content/service.py +278 -44
  128. unique_toolkit/content/smart_rules.py +301 -0
  129. unique_toolkit/content/utils.py +8 -3
  130. unique_toolkit/embedding/service.py +102 -11
  131. unique_toolkit/framework_utilities/__init__.py +1 -0
  132. unique_toolkit/framework_utilities/langchain/client.py +71 -0
  133. unique_toolkit/framework_utilities/langchain/history.py +19 -0
  134. unique_toolkit/framework_utilities/openai/__init__.py +6 -0
  135. unique_toolkit/framework_utilities/openai/client.py +83 -0
  136. unique_toolkit/framework_utilities/openai/message_builder.py +229 -0
  137. unique_toolkit/framework_utilities/utils.py +23 -0
  138. unique_toolkit/language_model/__init__.py +3 -0
  139. unique_toolkit/language_model/builder.py +27 -11
  140. unique_toolkit/language_model/default_language_model.py +3 -0
  141. unique_toolkit/language_model/functions.py +327 -43
  142. unique_toolkit/language_model/infos.py +992 -50
  143. unique_toolkit/language_model/reference.py +242 -0
  144. unique_toolkit/language_model/schemas.py +475 -48
  145. unique_toolkit/language_model/service.py +228 -27
  146. unique_toolkit/protocols/support.py +145 -0
  147. unique_toolkit/services/__init__.py +7 -0
  148. unique_toolkit/services/chat_service.py +1630 -0
  149. unique_toolkit/services/knowledge_base.py +861 -0
  150. unique_toolkit/short_term_memory/service.py +178 -41
  151. unique_toolkit/smart_rules/__init__.py +0 -0
  152. unique_toolkit/smart_rules/compile.py +56 -0
  153. unique_toolkit/test_utilities/events.py +197 -0
  154. {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/METADATA +606 -7
  155. unique_toolkit-1.23.0.dist-info/RECORD +182 -0
  156. unique_toolkit/evaluators/__init__.py +0 -1
  157. unique_toolkit/evaluators/config.py +0 -35
  158. unique_toolkit/evaluators/constants.py +0 -1
  159. unique_toolkit/evaluators/context_relevancy/constants.py +0 -32
  160. unique_toolkit/evaluators/context_relevancy/service.py +0 -53
  161. unique_toolkit/evaluators/context_relevancy/utils.py +0 -142
  162. unique_toolkit/evaluators/hallucination/constants.py +0 -41
  163. unique_toolkit-0.7.7.dist-info/RECORD +0 -64
  164. /unique_toolkit/{evaluators → agentic/evaluation}/exception.py +0 -0
  165. {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/LICENSE +0 -0
  166. {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/WHEEL +0 -0
unique_toolkit/agentic/evaluation/context_relevancy/service.py
@@ -0,0 +1,273 @@
+ import logging
+ from typing import overload
+
+ from pydantic import BaseModel, ValidationError
+ from typing_extensions import deprecated
+
+ from unique_toolkit._common.validate_required_values import (
+     validate_required_values,
+ )
+ from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+ from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
+     EvaluationSchemaStructuredOutput,
+ )
+ from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+ from unique_toolkit.agentic.evaluation.output_parser import (
+     parse_eval_metric_result,
+     parse_eval_metric_result_structured_output,
+ )
+ from unique_toolkit.agentic.evaluation.schemas import (
+     EvaluationMetricInput,
+     EvaluationMetricInputFieldName,
+     EvaluationMetricName,
+     EvaluationMetricResult,
+ )
+ from unique_toolkit.app.schemas import BaseEvent, ChatEvent
+ from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+ from unique_toolkit.language_model.infos import (
+     LanguageModelInfo,
+     ModelCapabilities,
+ )
+ from unique_toolkit.language_model.prompt import Prompt
+ from unique_toolkit.language_model.schemas import (
+     LanguageModelMessages,
+ )
+ from unique_toolkit.language_model.service import (
+     LanguageModelService,
+ )
+
+ from .prompts import (
+     CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+     CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
+     CONTEXT_RELEVANCY_METRIC_USER_MSG,
+     CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
+ )
+
+ SYSTEM_MSG_KEY = "systemPrompt"
+ USER_MSG_KEY = "userPrompt"
+
+ default_config = EvaluationMetricConfig(
+     enabled=False,
+     name=EvaluationMetricName.CONTEXT_RELEVANCY,
+     language_model=LanguageModelInfo.from_name(DEFAULT_GPT_4o),
+     custom_prompts={
+         SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+         USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
+     },
+ )
+
+ relevancy_required_input_fields = [
+     EvaluationMetricInputFieldName.INPUT_TEXT,
+     EvaluationMetricInputFieldName.CONTEXT_TEXTS,
+ ]
+
+
+ class ContextRelevancyEvaluator:
+     @deprecated(
+         "Use __init__ with company_id and user_id instead or use the classmethod `from_event`"
+     )
+     @overload
+     def __init__(self, event: ChatEvent | BaseEvent):
+         """
+         Initialize the ContextRelevancyEvaluator with an event (deprecated)
+         """
+
+     @overload
+     def __init__(self, *, company_id: str, user_id: str):
+         """
+         Initialize the ContextRelevancyEvaluator with a company_id and user_id
+         """
+
+     def __init__(
+         self,
+         event: ChatEvent | BaseEvent | None = None,
+         company_id: str | None = None,
+         user_id: str | None = None,
+     ):
+         if isinstance(event, (ChatEvent, BaseEvent)):
+             self.language_model_service = LanguageModelService.from_event(event)
+         else:
+             [company_id, user_id] = validate_required_values([company_id, user_id])
+             self.language_model_service = LanguageModelService(
+                 company_id=company_id, user_id=user_id
+             )
+
+         # Setup the logger
+         module_name = "ContextRelevancyEvaluator"
+         self.logger = logging.getLogger(f"{module_name}.{__name__}")
+
+     @classmethod
+     def from_event(cls, event: ChatEvent | BaseEvent):
+         return cls(company_id=event.company_id, user_id=event.user_id)
+
+     async def analyze(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig = default_config,
+         structured_output_schema: type[BaseModel] | None = None,
+     ) -> EvaluationMetricResult | None:
+         """
+         Analyzes the level of relevancy of a context by comparing
+         it with the input text.
+
+         Args:
+             input (EvaluationMetricInput): The input for the metric.
+             config (EvaluationMetricConfig): The configuration for the metric.
+
+         Returns:
+             EvaluationMetricResult | None
+
+         Raises:
+             EvaluatorException: If the context texts are empty or required fields are missing or error occurred during evaluation.
+         """
+         if config.enabled is False:
+             self.logger.info("Hallucination metric is not enabled.")
+             return None
+
+         input.validate_required_fields(relevancy_required_input_fields)
+
+         # TODO: Was already there in monorepo
+         if len(input.context_texts) == 0:  # type: ignore
+             error_message = "No context texts provided."
+             raise EvaluatorException(
+                 user_message=error_message,
+                 error_message=error_message,
+             )
+
+         try:
+             # Handle structured output if enabled and supported by the model
+             if (
+                 structured_output_schema
+                 and ModelCapabilities.STRUCTURED_OUTPUT
+                 in config.language_model.capabilities
+             ):
+                 return await self._handle_structured_output(
+                     input, config, structured_output_schema
+                 )
+
+             # Handle regular output
+             return await self._handle_regular_output(input, config)
+
+         except Exception as e:
+             error_message = (
+                 "Unknown error occurred during context relevancy metric analysis"
+             )
+             raise EvaluatorException(
+                 error_message=f"{error_message}: {e}",
+                 user_message=error_message,
+                 exception=e,
+             )
+
+     async def _handle_structured_output(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig,
+         structured_output_schema: type[BaseModel],
+     ) -> EvaluationMetricResult:
+         """Handle the structured output case for context relevancy evaluation."""
+         self.logger.info("Using structured output for context relevancy evaluation.")
+         msgs = self._compose_msgs(input, config, enable_structured_output=True)
+         result = await self.language_model_service.complete_async(
+             messages=msgs,
+             model_name=config.language_model.name,
+             structured_output_model=structured_output_schema,
+             structured_output_enforce_schema=True,
+             other_options=config.additional_llm_options,
+         )
+
+         try:
+             result_content = EvaluationSchemaStructuredOutput.model_validate(
+                 result.choices[0].message.parsed
+             )
+         except ValidationError as e:
+             error_message = "Error occurred during structured output validation of the context relevancy evaluation."
+             raise EvaluatorException(
+                 error_message=error_message,
+                 user_message=error_message,
+                 exception=e,
+             )
+
+         return parse_eval_metric_result_structured_output(
+             result_content, EvaluationMetricName.CONTEXT_RELEVANCY
+         )
+
+     async def _handle_regular_output(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig,
+     ) -> EvaluationMetricResult:
+         """Handle the regular output case for context relevancy evaluation."""
+         msgs = self._compose_msgs(input, config, enable_structured_output=False)
+         result = await self.language_model_service.complete_async(
+             messages=msgs,
+             model_name=config.language_model.name,
+             other_options=config.additional_llm_options,
+         )
+
+         result_content = result.choices[0].message.content
+         if not result_content or not isinstance(result_content, str):
+             error_message = "Context relevancy evaluation did not return a result."
+             raise EvaluatorException(
+                 error_message=error_message,
+                 user_message=error_message,
+             )
+
+         return parse_eval_metric_result(
+             result_content, EvaluationMetricName.CONTEXT_RELEVANCY
+         )
+
+     def _compose_msgs(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig,
+         enable_structured_output: bool,
+     ) -> LanguageModelMessages:
+         """
+         Composes the messages for the relevancy metric.
+         """
+         system_msg_content = self._get_system_prompt(config, enable_structured_output)
+         system_msg = Prompt(system_msg_content).to_system_msg()
+
+         user_msg = Prompt(
+             self._get_user_prompt(config, enable_structured_output),
+             input_text=input.input_text,
+             context_texts=input.get_joined_context_texts(),
+         ).to_user_msg()
+
+         return LanguageModelMessages([system_msg, user_msg])
+
+     def _get_system_prompt(
+         self,
+         config: EvaluationMetricConfig,
+         enable_structured_output: bool,
+     ):
+         if (
+             enable_structured_output
+             and ModelCapabilities.STRUCTURED_OUTPUT
+             in config.language_model.capabilities
+         ):
+             return config.custom_prompts.setdefault(
+                 SYSTEM_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
+             )
+         else:
+             return config.custom_prompts.setdefault(
+                 SYSTEM_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+             )
+
+     def _get_user_prompt(
+         self,
+         config: EvaluationMetricConfig,
+         enable_structured_output: bool,
+     ):
+         if enable_structured_output:
+             return config.custom_prompts.setdefault(
+                 USER_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
+             )
+         else:
+             return config.custom_prompts.setdefault(
+                 USER_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_USER_MSG,
+             )
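The new module also exposes a module-level default_config and relevancy_required_input_fields. Below is a minimal usage sketch based only on the signatures visible in this diff; the company/user ids and texts are placeholder values, and it assumes EvaluationMetricConfig is a Pydantic model so model_copy is available.

    import asyncio

    from unique_toolkit.agentic.evaluation.context_relevancy.service import (
        ContextRelevancyEvaluator,
        default_config,
    )
    from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput


    async def main() -> None:
        # Placeholder credentials; in a real deployment these come from the incoming event.
        evaluator = ContextRelevancyEvaluator(company_id="company-123", user_id="user-456")
        result = await evaluator.analyze(
            input=EvaluationMetricInput(
                input_text="What is the notice period in the contract?",
                context_texts=["The notice period is three months."],
            ),
            # The shipped default is disabled; enable it for this call (assumes Pydantic v2 model_copy).
            config=default_config.model_copy(update={"enabled": True}),
        )
        print(result)


    asyncio.run(main())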
unique_toolkit/agentic/evaluation/evaluation_manager.py
@@ -0,0 +1,218 @@
+ import asyncio
+ from abc import ABC
+ from logging import Logger
+
+ from unique_toolkit.agentic.evaluation.schemas import (
+     EvaluationAssessmentMessage,
+     EvaluationMetricName,
+     EvaluationMetricResult,
+ )
+ from unique_toolkit.agentic.tools.utils.execution.execution import (
+     Result,
+     SafeTaskExecutor,
+ )
+ from unique_toolkit.chat.schemas import (
+     ChatMessageAssessmentStatus,
+     ChatMessageAssessmentType,
+ )
+ from unique_toolkit.chat.service import ChatService
+ from unique_toolkit.language_model.schemas import (
+     LanguageModelStreamResponse,
+ )
+
+
+ class Evaluation(ABC):
+     """
+     Abstract base class for evaluation metrics.
+
+     typical use cases include:
+     - Hallucination checking
+     - compliance checking
+     """
+
+     def __init__(self, name: EvaluationMetricName):
+         self.name = name
+
+     def get_name(self) -> EvaluationMetricName:
+         return self.name
+
+     def get_assessment_type(self) -> ChatMessageAssessmentType:
+         raise NotImplementedError(
+             "Subclasses must implement this method to return the assessment type."
+         )
+
+     async def run(
+         self, loop_response: LanguageModelStreamResponse
+     ) -> EvaluationMetricResult:
+         raise NotImplementedError("Subclasses must implement this method.")
+
+     async def evaluation_metric_to_assessment(
+         self, evaluation_result: EvaluationMetricResult
+     ) -> EvaluationAssessmentMessage:
+         raise NotImplementedError(
+             "Subclasses must implement this method to convert evaluation results to assessment messages."
+         )
+
+
+ class EvaluationManager:
+     """
+     Manages the evaluation metrics and executes evaluation calls.
+
+     This class is responsible for:
+     - Storing and managing evaluation instances, identified by their unique names.
+     - Executing selected evaluations asynchronously and processing their results.
+     - Tracking the overall success or failure of evaluations.
+     - Integrating with external services like logging and chat systems to display evaluation statuses and results.
+     - Handling errors gracefully, including missing evaluations or failed executions.
+
+     Key Features:
+     - Evaluation Storage: Maintains a dictionary of evaluation instances for quick retrieval.
+     - Asynchronous Execution: Supports concurrent execution of multiple evaluations for efficiency.
+     - Result Processing: Tracks evaluation outcomes and updates the internal state based on results.
+     - Chat Integration: Updates the chat interface with evaluation statuses and detailed assessments.
+     - Error Handling: Provides robust error messages and fallback mechanisms for missing or failed evaluations.
+
+     The EvaluationManager serves as the central hub for managing and executing evaluations.
+     """
+
+     # a hashmap to hold evaluations by their names
+
+     def __init__(
+         self,
+         logger: Logger,
+         chat_service: ChatService,
+     ):
+         self._logger = logger
+         self._chat_service = chat_service
+         self._evaluations: dict[EvaluationMetricName, Evaluation] = {}
+         self._evaluation_passed: bool = True
+
+     def add_evaluation(self, evaluation: Evaluation):
+         self._evaluations[evaluation.get_name()] = evaluation
+
+     def get_evaluation_by_name(self, name: EvaluationMetricName) -> Evaluation | None:
+         return self._evaluations.get(name)
+
+     async def run_evaluations(
+         self,
+         selected_evaluation_names: list[EvaluationMetricName],
+         loop_response: LanguageModelStreamResponse,
+         assistant_message_id: str,
+     ) -> list[EvaluationMetricResult]:
+         task_executor = SafeTaskExecutor(
+             logger=self._logger,
+         )
+
+         tasks = [
+             task_executor.execute_async(
+                 self.execute_evaluation_call,
+                 loop_response=loop_response,
+                 evaluation_name=evaluation_name,
+                 assistant_message_id=assistant_message_id,
+             )
+             for evaluation_name in selected_evaluation_names
+         ]
+         evaluation_results = await asyncio.gather(*tasks)
+         evaluation_results_unpacked: list[EvaluationMetricResult] = []
+
+         for i, result in enumerate(evaluation_results):
+             unpacked_evaluation_result = self._create_evaluation_metric_result(
+                 result, selected_evaluation_names[i]
+             )
+             if not unpacked_evaluation_result.is_positive:
+                 self._evaluation_passed = False
+             evaluation_results_unpacked.append(unpacked_evaluation_result)
+
+         for evaluation_name, evaluation_result in zip(
+             selected_evaluation_names, evaluation_results_unpacked
+         ):
+             evaluation_instance = self.get_evaluation_by_name(evaluation_name)
+             if evaluation_instance:
+                 await self._show_message_assessment(
+                     evaluation_instance, evaluation_result, assistant_message_id
+                 )
+
+         return evaluation_results_unpacked
+
+     async def execute_evaluation_call(
+         self,
+         evaluation_name: EvaluationMetricName,
+         loop_response: LanguageModelStreamResponse,
+         assistant_message_id: str,
+     ) -> EvaluationMetricResult:
+         self._logger.info(f"Processing tool call: {evaluation_name}")
+
+         evaluation_instance = self.get_evaluation_by_name(evaluation_name)
+
+         if evaluation_instance:
+             # Execute the evaluation
+             await self._create_assistant_message(
+                 evaluation_instance, assistant_message_id
+             )
+             evaluation_metric_result: EvaluationMetricResult = (
+                 await evaluation_instance.run(loop_response)
+             )
+             return evaluation_metric_result
+
+         return EvaluationMetricResult(
+             name=evaluation_name,
+             is_positive=True,
+             value="RED",
+             reason=f"Evaluation named {evaluation_name} not found",
+             error=Exception("Evaluation named {evaluation_name} not found"),
+         )
+
+     def _create_evaluation_metric_result(
+         self,
+         result: Result[EvaluationMetricResult],
+         evaluation_name: EvaluationMetricName,
+     ) -> EvaluationMetricResult:
+         if not result.success:
+             return EvaluationMetricResult(
+                 name=evaluation_name,
+                 is_positive=True,
+                 value="RED",
+                 reason=str(result.exception),
+                 error=Exception("Evaluation result is not successful"),
+             )
+         unpacked = result.unpack()
+         if not isinstance(unpacked, EvaluationMetricResult):
+             return EvaluationMetricResult(
+                 name=evaluation_name,
+                 is_positive=True,
+                 value="RED",
+                 reason="Evaluation result is not of type EvaluationMetricResult",
+                 error=Exception(
+                     "Evaluation result is not of type EvaluationMetricResult"
+                 ),
+             )
+         return unpacked
+
+     async def _show_message_assessment(
+         self,
+         evaluation_instance: Evaluation,
+         evaluation_metric_result: EvaluationMetricResult,
+         assistant_message_id: str,
+     ) -> None:
+         evaluation_assessment_message = (
+             await evaluation_instance.evaluation_metric_to_assessment(
+                 evaluation_metric_result
+             )
+         )
+         await self._chat_service.modify_message_assessment_async(
+             assistant_message_id=assistant_message_id,
+             status=evaluation_assessment_message.status,
+             title=evaluation_assessment_message.title,
+             explanation=evaluation_assessment_message.explanation,
+             label=evaluation_assessment_message.label,
+             type=evaluation_assessment_message.type,
+         )
+
+     async def _create_assistant_message(
+         self, evaluation_instance: Evaluation, assistant_message_id: str
+     ):
+         await self._chat_service.create_message_assessment_async(
+             assistant_message_id=assistant_message_id,
+             status=ChatMessageAssessmentStatus.PENDING,
+             type=evaluation_instance.get_assessment_type(),
+         )
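A wiring sketch for the new manager, assuming the surrounding agent loop already provides a ChatService, a concrete Evaluation subclass, the stream response, and the assistant message id; the function name is illustrative, everything else follows the signatures shown in this hunk.

    import logging

    from unique_toolkit.agentic.evaluation.evaluation_manager import (
        Evaluation,
        EvaluationManager,
    )
    from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricResult
    from unique_toolkit.chat.service import ChatService
    from unique_toolkit.language_model.schemas import LanguageModelStreamResponse


    async def evaluate_loop_response(
        chat_service: ChatService,
        evaluation: Evaluation,
        loop_response: LanguageModelStreamResponse,
        assistant_message_id: str,
    ) -> list[EvaluationMetricResult]:
        manager = EvaluationManager(
            logger=logging.getLogger("evaluations"),
            chat_service=chat_service,
        )
        manager.add_evaluation(evaluation)
        # Runs the selected evaluations concurrently and posts assessments to the chat message.
        return await manager.run_evaluations(
            selected_evaluation_names=[evaluation.get_name()],
            loop_response=loop_response,
            assistant_message_id=assistant_message_id,
        )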
unique_toolkit/agentic/evaluation/hallucination/constants.py
@@ -0,0 +1,61 @@
+ from typing import Any
+
+ from pydantic import Field
+
+ from unique_toolkit._common.validators import LMI
+ from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+ from unique_toolkit.agentic.evaluation.hallucination.prompts import (
+     HALLUCINATION_METRIC_SYSTEM_MSG,
+     HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+     HALLUCINATION_METRIC_USER_MSG,
+     HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+ )
+ from unique_toolkit.agentic.evaluation.schemas import (
+     EvaluationMetricInputFieldName,
+     EvaluationMetricName,
+ )
+ from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+ from unique_toolkit.language_model.infos import LanguageModelInfo
+
+ SYSTEM_MSG_KEY = "systemPrompt"
+ USER_MSG_KEY = "userPrompt"
+ SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
+ USER_MSG_DEFAULT_KEY = "userPromptDefault"
+
+
+ class HallucinationConfig(EvaluationMetricConfig):
+     enabled: bool = False
+     name: EvaluationMetricName = EvaluationMetricName.HALLUCINATION
+     language_model: LMI = LanguageModelInfo.from_name(
+         DEFAULT_GPT_4o,
+     )
+     additional_llm_options: dict[str, Any] = Field(
+         default={},
+         description="Additional options to pass to the language model.",
+     )
+     custom_prompts: dict = {
+         SYSTEM_MSG_KEY: HALLUCINATION_METRIC_SYSTEM_MSG,
+         USER_MSG_KEY: HALLUCINATION_METRIC_USER_MSG,
+         SYSTEM_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+         USER_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+     }
+     score_to_label: dict = {
+         "LOW": "GREEN",
+         "MEDIUM": "YELLOW",
+         "HIGH": "RED",
+     }
+     score_to_title: dict = {
+         "LOW": "No Hallucination Detected",
+         "MEDIUM": "Hallucination Warning",
+         "HIGH": "High Hallucination",
+     }
+
+
+ hallucination_metric_default_config = HallucinationConfig()
+
+ hallucination_required_input_fields = [
+     EvaluationMetricInputFieldName.INPUT_TEXT,
+     EvaluationMetricInputFieldName.CONTEXT_TEXTS,
+     EvaluationMetricInputFieldName.HISTORY_MESSAGES,
+     EvaluationMetricInputFieldName.OUTPUT_TEXT,
+ ]
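These defaults can be overridden per deployment. A small sketch, assuming the standard Pydantic keyword-argument constructor; the overridden label mapping is illustrative only.

    from unique_toolkit.agentic.evaluation.hallucination.constants import (
        HallucinationConfig,
        hallucination_metric_default_config,
    )

    # Enable the metric and treat medium hallucination scores as failures (illustrative values).
    strict_config = HallucinationConfig(
        enabled=True,
        score_to_label={"LOW": "GREEN", "MEDIUM": "RED", "HIGH": "RED"},
    )

    # The module-level default stays disabled and keeps the shipped mappings.
    print(hallucination_metric_default_config.score_to_title["MEDIUM"])  # "Hallucination Warning"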
unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py
@@ -0,0 +1,111 @@
+ import regex as re
+
+ from unique_toolkit.agentic.evaluation.evaluation_manager import Evaluation
+ from unique_toolkit.agentic.evaluation.hallucination.constants import (
+     HallucinationConfig,
+ )
+ from unique_toolkit.agentic.evaluation.hallucination.utils import check_hallucination
+ from unique_toolkit.agentic.evaluation.schemas import (
+     EvaluationAssessmentMessage,
+     EvaluationMetricInput,
+     EvaluationMetricName,
+     EvaluationMetricResult,
+ )
+ from unique_toolkit.agentic.reference_manager.reference_manager import (
+     ReferenceManager,
+ )
+ from unique_toolkit.app.schemas import ChatEvent
+ from unique_toolkit.chat.schemas import (
+     ChatMessageAssessmentLabel,
+     ChatMessageAssessmentStatus,
+     ChatMessageAssessmentType,
+ )
+ from unique_toolkit.language_model.reference import _preprocess_message
+ from unique_toolkit.language_model.schemas import (
+     LanguageModelStreamResponse,
+ )
+
+
+ class HallucinationEvaluation(Evaluation):
+     def __init__(
+         self,
+         config: HallucinationConfig,
+         event: ChatEvent,
+         reference_manager: ReferenceManager,
+     ):
+         self.config = config
+         self._company_id = event.company_id
+         self._user_id = event.user_id
+         self._reference_manager = reference_manager
+         self._user_message = event.payload.user_message.text
+         super().__init__(EvaluationMetricName.HALLUCINATION)
+
+     async def run(
+         self, loop_response: LanguageModelStreamResponse
+     ) -> EvaluationMetricResult:  # type: ignore
+         all_chunks = self._reference_manager.get_chunks()
+
+         # source numbers from original text
+         ref_pattern = r"\[(\d+)\]"
+         original_text = loop_response.message.original_text
+
+         # preprocess original text to deal with different source patterns
+         original_text_preprocessed = _preprocess_message(original_text)
+
+         source_number_matches = re.findall(ref_pattern, original_text_preprocessed)
+         source_numbers = {int(num) for num in source_number_matches}
+
+         referenced_chunks = [all_chunks[idx] for idx in source_numbers]
+
+         evaluation_result: EvaluationMetricResult = await check_hallucination(
+             company_id=self._company_id,
+             input=EvaluationMetricInput(
+                 input_text=self._user_message,
+                 context_texts=[context.text for context in referenced_chunks],
+                 history_messages=[],  # TODO include loop_history messages
+                 output_text=loop_response.message.text,
+             ),
+             config=self.config,
+         )
+
+         score_to_label = self.config.score_to_label
+         evaluation_result.is_positive = (
+             score_to_label.get(evaluation_result.value.upper(), "RED") != "RED"
+         )
+         return evaluation_result
+
+     def get_assessment_type(self) -> ChatMessageAssessmentType:
+         return ChatMessageAssessmentType.HALLUCINATION
+
+     async def evaluation_metric_to_assessment(
+         self, evaluation_result: EvaluationMetricResult
+     ) -> EvaluationAssessmentMessage:
+         title = self.config.score_to_title.get(
+             evaluation_result.value.upper(), evaluation_result.value
+         )
+         label = ChatMessageAssessmentLabel(
+             self.config.score_to_label.get(
+                 evaluation_result.value.upper(), evaluation_result.value.upper()
+             )
+         )
+         status = (
+             ChatMessageAssessmentStatus.DONE
+             if not evaluation_result.error
+             else ChatMessageAssessmentStatus.ERROR
+         )
+         explanation = evaluation_result.reason
+
+         if status == ChatMessageAssessmentStatus.ERROR:
+             title = "Hallucination Check Error"
+             label = ChatMessageAssessmentLabel.RED
+             explanation = (
+                 "An unrecoverable error occurred while evaluating the response."
+             )
+
+         return EvaluationAssessmentMessage(
+             status=status,
+             title=title,
+             explanation=explanation,
+             label=label,
+             type=self.get_assessment_type(),
+         )
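A usage sketch tying the pieces together, assuming a ChatEvent and a populated ReferenceManager are available from the surrounding agent loop; only names visible in this diff are used.

    from unique_toolkit.agentic.evaluation.hallucination.constants import (
        HallucinationConfig,
    )
    from unique_toolkit.agentic.evaluation.hallucination.hallucination_evaluation import (
        HallucinationEvaluation,
    )
    from unique_toolkit.agentic.evaluation.schemas import EvaluationAssessmentMessage
    from unique_toolkit.agentic.reference_manager.reference_manager import ReferenceManager
    from unique_toolkit.app.schemas import ChatEvent
    from unique_toolkit.language_model.schemas import LanguageModelStreamResponse


    async def assess_hallucination(
        event: ChatEvent,
        reference_manager: ReferenceManager,
        loop_response: LanguageModelStreamResponse,
    ) -> EvaluationAssessmentMessage:
        evaluation = HallucinationEvaluation(
            config=HallucinationConfig(enabled=True),
            event=event,
            reference_manager=reference_manager,
        )
        result = await evaluation.run(loop_response)
        # Map the raw metric result onto a chat assessment (status, title, label, explanation).
        return await evaluation.evaluation_metric_to_assessment(result)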
unique_toolkit/agentic/evaluation/hallucination/prompts.py
@@ -8,7 +8,7 @@ Use the following entailment scale to generate a score:
  [medium] - The output is supported by the references/conversation to some extent, but there is at least some information in the output that is not discussed in the references/conversation. For example, if an instruction asks about two concepts and the references/conversation only discusses either of them, it should be considered a [medium] hallucination level.
  [high] - The output contains information that is not part of the references/conversation, is unrelated to the references/conversation, or contradicts the references/conversation.
 
- Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check whether the output is supported by the references/conversation, and not whether the output is correct or not. Also do not evaluate if the references/conversation contain further information that is not part of the output but could be relevant to the qestion.
+ Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check whether the output is supported by the references/conversation, and not whether the output is correct or not. Also do not evaluate if the references/conversation contain further information that is not part of the output but could be relevant to the question. If the output mentions a plot or chart, ignore this information in your evaluation.
 
  Your answer must be in JSON format:
  {