unique_toolkit 0.7.7__py3-none-any.whl → 1.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unique_toolkit might be problematic.

Files changed (166)
  1. unique_toolkit/__init__.py +28 -1
  2. unique_toolkit/_common/api_calling/human_verification_manager.py +343 -0
  3. unique_toolkit/_common/base_model_type_attribute.py +303 -0
  4. unique_toolkit/_common/chunk_relevancy_sorter/config.py +49 -0
  5. unique_toolkit/_common/chunk_relevancy_sorter/exception.py +5 -0
  6. unique_toolkit/_common/chunk_relevancy_sorter/schemas.py +46 -0
  7. unique_toolkit/_common/chunk_relevancy_sorter/service.py +374 -0
  8. unique_toolkit/_common/chunk_relevancy_sorter/tests/test_service.py +275 -0
  9. unique_toolkit/_common/default_language_model.py +12 -0
  10. unique_toolkit/_common/docx_generator/__init__.py +7 -0
  11. unique_toolkit/_common/docx_generator/config.py +12 -0
  12. unique_toolkit/_common/docx_generator/schemas.py +80 -0
  13. unique_toolkit/_common/docx_generator/service.py +252 -0
  14. unique_toolkit/_common/docx_generator/template/Doc Template.docx +0 -0
  15. unique_toolkit/_common/endpoint_builder.py +305 -0
  16. unique_toolkit/_common/endpoint_requestor.py +430 -0
  17. unique_toolkit/_common/exception.py +24 -0
  18. unique_toolkit/_common/feature_flags/schema.py +9 -0
  19. unique_toolkit/_common/pydantic/rjsf_tags.py +936 -0
  20. unique_toolkit/_common/pydantic_helpers.py +154 -0
  21. unique_toolkit/_common/referencing.py +53 -0
  22. unique_toolkit/_common/string_utilities.py +140 -0
  23. unique_toolkit/_common/tests/test_referencing.py +521 -0
  24. unique_toolkit/_common/tests/test_string_utilities.py +506 -0
  25. unique_toolkit/_common/token/image_token_counting.py +67 -0
  26. unique_toolkit/_common/token/token_counting.py +204 -0
  27. unique_toolkit/_common/utils/__init__.py +1 -0
  28. unique_toolkit/_common/utils/files.py +43 -0
  29. unique_toolkit/_common/utils/structured_output/__init__.py +1 -0
  30. unique_toolkit/_common/utils/structured_output/schema.py +5 -0
  31. unique_toolkit/_common/utils/write_configuration.py +51 -0
  32. unique_toolkit/_common/validators.py +101 -4
  33. unique_toolkit/agentic/__init__.py +1 -0
  34. unique_toolkit/agentic/debug_info_manager/debug_info_manager.py +28 -0
  35. unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py +278 -0
  36. unique_toolkit/agentic/evaluation/config.py +36 -0
  37. unique_toolkit/{evaluators → agentic/evaluation}/context_relevancy/prompts.py +25 -0
  38. unique_toolkit/agentic/evaluation/context_relevancy/schema.py +80 -0
  39. unique_toolkit/agentic/evaluation/context_relevancy/service.py +273 -0
  40. unique_toolkit/agentic/evaluation/evaluation_manager.py +218 -0
  41. unique_toolkit/agentic/evaluation/hallucination/constants.py +61 -0
  42. unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py +111 -0
  43. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/prompts.py +1 -1
  44. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/service.py +16 -15
  45. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/utils.py +30 -20
  46. unique_toolkit/{evaluators → agentic/evaluation}/output_parser.py +20 -2
  47. unique_toolkit/{evaluators → agentic/evaluation}/schemas.py +27 -7
  48. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +253 -0
  49. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +87 -0
  50. unique_toolkit/agentic/history_manager/history_construction_with_contents.py +297 -0
  51. unique_toolkit/agentic/history_manager/history_manager.py +242 -0
  52. unique_toolkit/agentic/history_manager/loop_token_reducer.py +484 -0
  53. unique_toolkit/agentic/history_manager/utils.py +96 -0
  54. unique_toolkit/agentic/postprocessor/postprocessor_manager.py +212 -0
  55. unique_toolkit/agentic/reference_manager/reference_manager.py +103 -0
  56. unique_toolkit/agentic/responses_api/__init__.py +19 -0
  57. unique_toolkit/agentic/responses_api/postprocessors/code_display.py +63 -0
  58. unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +145 -0
  59. unique_toolkit/agentic/responses_api/stream_handler.py +15 -0
  60. unique_toolkit/agentic/short_term_memory_manager/persistent_short_term_memory_manager.py +141 -0
  61. unique_toolkit/agentic/thinking_manager/thinking_manager.py +103 -0
  62. unique_toolkit/agentic/tools/__init__.py +1 -0
  63. unique_toolkit/agentic/tools/a2a/__init__.py +36 -0
  64. unique_toolkit/agentic/tools/a2a/config.py +17 -0
  65. unique_toolkit/agentic/tools/a2a/evaluation/__init__.py +15 -0
  66. unique_toolkit/agentic/tools/a2a/evaluation/_utils.py +66 -0
  67. unique_toolkit/agentic/tools/a2a/evaluation/config.py +55 -0
  68. unique_toolkit/agentic/tools/a2a/evaluation/evaluator.py +260 -0
  69. unique_toolkit/agentic/tools/a2a/evaluation/summarization_user_message.j2 +9 -0
  70. unique_toolkit/agentic/tools/a2a/manager.py +55 -0
  71. unique_toolkit/agentic/tools/a2a/postprocessing/__init__.py +21 -0
  72. unique_toolkit/agentic/tools/a2a/postprocessing/_display_utils.py +185 -0
  73. unique_toolkit/agentic/tools/a2a/postprocessing/_ref_utils.py +73 -0
  74. unique_toolkit/agentic/tools/a2a/postprocessing/config.py +45 -0
  75. unique_toolkit/agentic/tools/a2a/postprocessing/display.py +180 -0
  76. unique_toolkit/agentic/tools/a2a/postprocessing/references.py +101 -0
  77. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display_utils.py +1335 -0
  78. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_ref_utils.py +603 -0
  79. unique_toolkit/agentic/tools/a2a/prompts.py +46 -0
  80. unique_toolkit/agentic/tools/a2a/response_watcher/__init__.py +6 -0
  81. unique_toolkit/agentic/tools/a2a/response_watcher/service.py +91 -0
  82. unique_toolkit/agentic/tools/a2a/tool/__init__.py +4 -0
  83. unique_toolkit/agentic/tools/a2a/tool/_memory.py +26 -0
  84. unique_toolkit/agentic/tools/a2a/tool/_schema.py +9 -0
  85. unique_toolkit/agentic/tools/a2a/tool/config.py +73 -0
  86. unique_toolkit/agentic/tools/a2a/tool/service.py +306 -0
  87. unique_toolkit/agentic/tools/agent_chunks_hanlder.py +65 -0
  88. unique_toolkit/agentic/tools/config.py +167 -0
  89. unique_toolkit/agentic/tools/factory.py +44 -0
  90. unique_toolkit/agentic/tools/mcp/__init__.py +4 -0
  91. unique_toolkit/agentic/tools/mcp/manager.py +71 -0
  92. unique_toolkit/agentic/tools/mcp/models.py +28 -0
  93. unique_toolkit/agentic/tools/mcp/tool_wrapper.py +234 -0
  94. unique_toolkit/agentic/tools/openai_builtin/__init__.py +11 -0
  95. unique_toolkit/agentic/tools/openai_builtin/base.py +30 -0
  96. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/__init__.py +8 -0
  97. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/config.py +57 -0
  98. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/service.py +230 -0
  99. unique_toolkit/agentic/tools/openai_builtin/manager.py +62 -0
  100. unique_toolkit/agentic/tools/schemas.py +141 -0
  101. unique_toolkit/agentic/tools/test/test_mcp_manager.py +536 -0
  102. unique_toolkit/agentic/tools/test/test_tool_progress_reporter.py +445 -0
  103. unique_toolkit/agentic/tools/tool.py +183 -0
  104. unique_toolkit/agentic/tools/tool_manager.py +523 -0
  105. unique_toolkit/agentic/tools/tool_progress_reporter.py +285 -0
  106. unique_toolkit/agentic/tools/utils/__init__.py +19 -0
  107. unique_toolkit/agentic/tools/utils/execution/__init__.py +1 -0
  108. unique_toolkit/agentic/tools/utils/execution/execution.py +286 -0
  109. unique_toolkit/agentic/tools/utils/source_handling/__init__.py +0 -0
  110. unique_toolkit/agentic/tools/utils/source_handling/schema.py +21 -0
  111. unique_toolkit/agentic/tools/utils/source_handling/source_formatting.py +207 -0
  112. unique_toolkit/agentic/tools/utils/source_handling/tests/test_source_formatting.py +216 -0
  113. unique_toolkit/app/__init__.py +6 -0
  114. unique_toolkit/app/dev_util.py +180 -0
  115. unique_toolkit/app/init_sdk.py +32 -1
  116. unique_toolkit/app/schemas.py +198 -31
  117. unique_toolkit/app/unique_settings.py +367 -0
  118. unique_toolkit/chat/__init__.py +8 -1
  119. unique_toolkit/chat/deprecated/service.py +232 -0
  120. unique_toolkit/chat/functions.py +642 -77
  121. unique_toolkit/chat/rendering.py +34 -0
  122. unique_toolkit/chat/responses_api.py +461 -0
  123. unique_toolkit/chat/schemas.py +133 -2
  124. unique_toolkit/chat/service.py +115 -767
  125. unique_toolkit/content/functions.py +153 -4
  126. unique_toolkit/content/schemas.py +122 -15
  127. unique_toolkit/content/service.py +278 -44
  128. unique_toolkit/content/smart_rules.py +301 -0
  129. unique_toolkit/content/utils.py +8 -3
  130. unique_toolkit/embedding/service.py +102 -11
  131. unique_toolkit/framework_utilities/__init__.py +1 -0
  132. unique_toolkit/framework_utilities/langchain/client.py +71 -0
  133. unique_toolkit/framework_utilities/langchain/history.py +19 -0
  134. unique_toolkit/framework_utilities/openai/__init__.py +6 -0
  135. unique_toolkit/framework_utilities/openai/client.py +83 -0
  136. unique_toolkit/framework_utilities/openai/message_builder.py +229 -0
  137. unique_toolkit/framework_utilities/utils.py +23 -0
  138. unique_toolkit/language_model/__init__.py +3 -0
  139. unique_toolkit/language_model/builder.py +27 -11
  140. unique_toolkit/language_model/default_language_model.py +3 -0
  141. unique_toolkit/language_model/functions.py +327 -43
  142. unique_toolkit/language_model/infos.py +992 -50
  143. unique_toolkit/language_model/reference.py +242 -0
  144. unique_toolkit/language_model/schemas.py +475 -48
  145. unique_toolkit/language_model/service.py +228 -27
  146. unique_toolkit/protocols/support.py +145 -0
  147. unique_toolkit/services/__init__.py +7 -0
  148. unique_toolkit/services/chat_service.py +1630 -0
  149. unique_toolkit/services/knowledge_base.py +861 -0
  150. unique_toolkit/short_term_memory/service.py +178 -41
  151. unique_toolkit/smart_rules/__init__.py +0 -0
  152. unique_toolkit/smart_rules/compile.py +56 -0
  153. unique_toolkit/test_utilities/events.py +197 -0
  154. {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/METADATA +606 -7
  155. unique_toolkit-1.23.0.dist-info/RECORD +182 -0
  156. unique_toolkit/evaluators/__init__.py +0 -1
  157. unique_toolkit/evaluators/config.py +0 -35
  158. unique_toolkit/evaluators/constants.py +0 -1
  159. unique_toolkit/evaluators/context_relevancy/constants.py +0 -32
  160. unique_toolkit/evaluators/context_relevancy/service.py +0 -53
  161. unique_toolkit/evaluators/context_relevancy/utils.py +0 -142
  162. unique_toolkit/evaluators/hallucination/constants.py +0 -41
  163. unique_toolkit-0.7.7.dist-info/RECORD +0 -64
  164. /unique_toolkit/{evaluators → agentic/evaluation}/exception.py +0 -0
  165. {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/LICENSE +0 -0
  166. {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/WHEEL +0 -0
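
The listing is dominated by the relocation of the old unique_toolkit.evaluators package into unique_toolkit.agentic.evaluation, which the renamed hunks below spell out. As a rough orientation for consumers of 0.7.7, updating mostly means swapping import roots; the sketch below is a hedged illustration that uses only paths visible in this diff and does not claim to cover the full 1.23.0 public API.

# Import paths removed in this release (0.7.7), shown as comments:
#   from unique_toolkit.evaluators.config import EvaluationMetricConfig
#   from unique_toolkit.evaluators.schemas import EvaluationMetricInput, EvaluationMetricResult
#   from unique_toolkit.evaluators.hallucination.utils import check_hallucination_async

# Their 1.23.0 counterparts, as they appear in the hunks below:
from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
from unique_toolkit.agentic.evaluation.schemas import (
    EvaluationMetricInput,
    EvaluationMetricResult,
)
from unique_toolkit.agentic.evaluation.hallucination.utils import check_hallucination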

unique_toolkit/{evaluators → agentic/evaluation}/hallucination/service.py

@@ -1,27 +1,28 @@
  import logging

- from unique_toolkit.app.schemas import Event
- from unique_toolkit.evaluators.config import (
-     EvaluationMetricConfig,
- )
- from unique_toolkit.evaluators.hallucination.constants import (
-     hallucination_metric_default_config,
- )
- from unique_toolkit.evaluators.hallucination.utils import check_hallucination_async
- from unique_toolkit.evaluators.schemas import (
+ from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+ from unique_toolkit.agentic.evaluation.schemas import (
      EvaluationMetricInput,
      EvaluationMetricResult,
  )
+ from unique_toolkit.app.schemas import ChatEvent

- logger = logging.getLogger(__name__)
+ from .constants import hallucination_metric_default_config
+ from .utils import check_hallucination
+
+ SYSTEM_MSG_KEY = "systemPrompt"
+ USER_MSG_KEY = "userPrompt"
+ SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
+ USER_MSG_DEFAULT_KEY = "userPromptDefault"


  class HallucinationEvaluator:
-     def __init__(self, event: Event, logger: logging.Logger = logger):
+     def __init__(self, event: ChatEvent):
          self.event = event
-         self.logger = logger

-     async def run(
+         self.logger = logging.getLogger(f"HallucinationEvaluator.{__name__}")
+
+     async def analyze(
          self,
          input: EvaluationMetricInput,
          config: EvaluationMetricConfig = hallucination_metric_default_config,
@@ -36,7 +37,7 @@ class HallucinationEvaluator:
          If no contexts or history are referenced in the generated output, the method verifies
          that the output does not contain any relevant information to answer the question.

-         This method calls `check_hallucination_async` to perform the actual analysis. The `check_hallucination_async`
+         This method calls `check_hallucination` to perform the actual analysis. The `check_hallucination`
          function handles the evaluation using the company ID from the event, the provided input, and the configuration.

          Args:
@@ -53,6 +54,6 @@ class HallucinationEvaluator:
              self.logger.info("Hallucination metric is not enabled.")
              return None

-         return await check_hallucination_async(
+         return await check_hallucination(
              company_id=self.event.company_id, input=input, config=config
          )
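
For callers, the hunks above rename the evaluator's entry point from run to analyze and replace the Event parameter (and injectable logger) with a ChatEvent. The sketch below is illustrative only: the module path is inferred from the file listing, the event is a mock, and a real run additionally needs a configured Unique environment and possibly further EvaluationMetricInput fields required by the hallucination check.

from unittest.mock import MagicMock

from unique_toolkit.agentic.evaluation.hallucination.service import (  # path inferred from the file listing
    HallucinationEvaluator,
)
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput
from unique_toolkit.app.schemas import ChatEvent


async def run_hallucination_check() -> None:
    event = MagicMock(spec=ChatEvent)  # placeholder; in production this is the incoming chat event
    event.company_id = "company_0"

    evaluator = HallucinationEvaluator(event)  # 0.7.7 took an Event plus an optional logger
    metric_input = EvaluationMetricInput(
        input_text="What is the capital of France?",
        context_texts=["Paris is the capital of France."],
    )
    # analyze() replaces run() and falls back to hallucination_metric_default_config.
    result = await evaluator.analyze(metric_input)
    print(result)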

unique_toolkit/{evaluators → agentic/evaluation}/hallucination/utils.py

@@ -1,32 +1,30 @@
  import logging
  from string import Template

- from unique_toolkit.evaluators.config import (
-     EvaluationMetricConfig,
- )
- from unique_toolkit.evaluators.exception import EvaluatorException
- from unique_toolkit.evaluators.hallucination.constants import (
-     SYSTEM_MSG_DEFAULT_KEY,
-     SYSTEM_MSG_KEY,
-     USER_MSG_DEFAULT_KEY,
-     USER_MSG_KEY,
-     hallucination_required_input_fields,
- )
- from unique_toolkit.evaluators.output_parser import (
-     parse_eval_metric_result,
- )
- from unique_toolkit.evaluators.schemas import (
+ from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+ from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+ from unique_toolkit.agentic.evaluation.output_parser import parse_eval_metric_result
+ from unique_toolkit.agentic.evaluation.schemas import (
      EvaluationMetricInput,
      EvaluationMetricName,
      EvaluationMetricResult,
  )
+ from unique_toolkit.content.schemas import ContentChunk
  from unique_toolkit.language_model.schemas import (
      LanguageModelMessages,
+     LanguageModelStreamResponse,
      LanguageModelSystemMessage,
      LanguageModelUserMessage,
  )
  from unique_toolkit.language_model.service import LanguageModelService

+ from .constants import (
+     SYSTEM_MSG_DEFAULT_KEY,
+     SYSTEM_MSG_KEY,
+     USER_MSG_DEFAULT_KEY,
+     USER_MSG_KEY,
+     hallucination_required_input_fields,
+ )
  from .prompts import (
      HALLUCINATION_METRIC_SYSTEM_MSG,
      HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
@@ -34,15 +32,12 @@ from .prompts import (
      HALLUCINATION_METRIC_USER_MSG_DEFAULT,
  )

- logger = logging.getLogger(__name__)
-

- async def check_hallucination_async(
+ async def check_hallucination(
      company_id: str,
      input: EvaluationMetricInput,
      config: EvaluationMetricConfig,
-     logger: logging.Logger = logger,
- ) -> EvaluationMetricResult | None:
+ ) -> EvaluationMetricResult:
      """
      Analyzes the level of hallucination in the generated output by comparing it with the provided input
      and the contexts or history. The analysis classifies the hallucination level as:
@@ -73,6 +68,9 @@ async def check_hallucination_async(
      Raises:
          EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
      """
+
+     logger = logging.getLogger(f"check_hallucination.{__name__}")
+
      model_name = config.language_model.name
      logger.info(f"Analyzing level of hallucination with {model_name}.")

@@ -200,3 +198,15 @@ def _get_user_prompt_default(config: EvaluationMetricConfig):
          USER_MSG_DEFAULT_KEY,
          HALLUCINATION_METRIC_USER_MSG_DEFAULT,
      )
+
+
+ def context_text_from_stream_response(
+     response: LanguageModelStreamResponse, selected_chunks: list[ContentChunk]
+ ):
+     response_references = response.message.references
+     reference_ids = [reference.source_id for reference in response_references]
+     filtered_contexts: list[str] = []
+     for chunk in selected_chunks:
+         if f"{chunk.id}_{chunk.chunk_id}" in reference_ids:
+             filtered_contexts.append(chunk.text)
+     return filtered_contexts

unique_toolkit/{evaluators → agentic/evaluation}/output_parser.py

@@ -1,5 +1,8 @@
- from unique_toolkit.evaluators.exception import EvaluatorException
- from unique_toolkit.evaluators.schemas import (
+ from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
+     EvaluationSchemaStructuredOutput,
+ )
+ from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+ from unique_toolkit.agentic.evaluation.schemas import (
      EvaluationMetricName,
      EvaluationMetricResult,
  )
@@ -28,3 +31,18 @@ def parse_eval_metric_result(
          value=parsed_result.get("value", "None"),
          reason=parsed_result.get("reason", "None"),
      )
+
+
+ def parse_eval_metric_result_structured_output(
+     result: EvaluationSchemaStructuredOutput,
+     metric_name: EvaluationMetricName,
+ ) -> EvaluationMetricResult:
+     """
+     Parses the evaluation metric result.
+     """
+     return EvaluationMetricResult(
+         name=metric_name,
+         value=result.value,
+         reason=result.reason,
+         fact_list=[item.fact for item in result.fact_list],
+     )

unique_toolkit/{evaluators → agentic/evaluation}/schemas.py

@@ -1,18 +1,24 @@
- from enum import Enum
+ from enum import StrEnum
  from typing import Optional

- from pydantic import BaseModel
+ from pydantic import BaseModel, ConfigDict, Field

+ from unique_toolkit.agentic.evaluation.exception import EvaluatorException
  from unique_toolkit.chat import ChatMessage
- from unique_toolkit.evaluators.exception import EvaluatorException
+ from unique_toolkit.chat.schemas import (
+     ChatMessageAssessmentLabel,
+     ChatMessageAssessmentStatus,
+     ChatMessageAssessmentType,
+ )


- class EvaluationMetricName(Enum):
+ class EvaluationMetricName(StrEnum):
      HALLUCINATION = "hallucination"
      CONTEXT_RELEVANCY = "relevancy"
+     SUB_AGENT = "sub_agent"


- class EvaluationMetricInputFieldName(str, Enum):
+ class EvaluationMetricInputFieldName(StrEnum):
      INPUT_TEXT = "input_text"
      CONTEXT_TEXTS = "context_texts"
      HISTORY_MESSAGES = "history_messages"
@@ -38,7 +44,7 @@ class EvaluationMetricInput(BaseModel):

          return "\n".join(
              [
-                 f"<{tag_name}-{index}>{text}</{tag_name}-{index}>"
+                 f"<{tag_name}-{index + 1}>{text}</{tag_name}-{index + 1}>"
                  for index, text in enumerate(self.context_texts)
              ]
          )
@@ -46,7 +52,7 @@ class EvaluationMetricInput(BaseModel):
      def get_history_message_text(self, chat_message: ChatMessage):
          return f"{chat_message.role.value}: {chat_message.content}"

-     def get_history_message_texts(self):
+     def get_history_message_texts(self) -> list[str]:
          if not self.history_messages:
              return []
          return [self.get_history_message_text(msg) for msg in self.history_messages]
@@ -77,6 +83,20 @@ class EvaluationMetricInput(BaseModel):


  class EvaluationMetricResult(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
      name: EvaluationMetricName
      value: str
      reason: str
+     is_positive: Optional[bool] = None
+     user_info: Optional[str] = None
+     error: Exception | None = None
+     fact_list: list[str] = Field(default_factory=list[str])
+
+
+ class EvaluationAssessmentMessage(BaseModel):
+     status: ChatMessageAssessmentStatus
+     explanation: str
+     title: str
+     label: ChatMessageAssessmentLabel
+     type: ChatMessageAssessmentType
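
The widened EvaluationMetricResult above can be constructed directly. A small illustrative sketch, using only the fields visible in this hunk (the values are made up):

from unique_toolkit.agentic.evaluation.schemas import (
    EvaluationMetricName,
    EvaluationMetricResult,
)

result = EvaluationMetricResult(
    name=EvaluationMetricName.HALLUCINATION,
    value="low",
    reason="The answer is grounded in the referenced context.",
    is_positive=True,  # new optional flag in 1.23.0
    user_info="No hallucination detected.",  # new optional, user-facing text
    fact_list=["Paris is the capital of France."],  # new, defaults to an empty list
)
# An Exception can also be attached via the new error field; arbitrary_types_allowed
# is enabled on the model specifically to allow that. It stays None here.
print(result)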

unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py

@@ -0,0 +1,253 @@
+ from unittest.mock import MagicMock, patch
+
+ import pytest
+
+ from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+ from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
+     CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+ )
+ from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
+     EvaluationSchemaStructuredOutput,
+ )
+ from unique_toolkit.agentic.evaluation.context_relevancy.service import (
+     ContextRelevancyEvaluator,
+ )
+ from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+ from unique_toolkit.agentic.evaluation.schemas import (
+     EvaluationMetricInput,
+     EvaluationMetricName,
+     EvaluationMetricResult,
+ )
+ from unique_toolkit.app.schemas import ChatEvent
+ from unique_toolkit.chat.service import LanguageModelName
+ from unique_toolkit.language_model.infos import (
+     LanguageModelInfo,
+ )
+ from unique_toolkit.language_model.schemas import (
+     LanguageModelAssistantMessage,
+     LanguageModelCompletionChoice,
+     LanguageModelMessages,
+ )
+ from unique_toolkit.language_model.service import LanguageModelResponse
+
+
+ @pytest.fixture
+ def event():
+     event = MagicMock(spec=ChatEvent)
+     event.payload = MagicMock()
+     event.payload.user_message = MagicMock()
+     event.payload.user_message.text = "Test query"
+     event.user_id = "user_0"
+     event.company_id = "company_0"
+     return event
+
+
+ @pytest.fixture
+ def evaluator(event):
+     return ContextRelevancyEvaluator(event)
+
+
+ @pytest.fixture
+ def basic_config():
+     return EvaluationMetricConfig(
+         enabled=True,
+         name=EvaluationMetricName.CONTEXT_RELEVANCY,
+         language_model=LanguageModelInfo.from_name(
+             LanguageModelName.AZURE_GPT_4o_2024_0806
+         ),
+     )
+
+
+ @pytest.fixture
+ def structured_config(basic_config):
+     model_info = LanguageModelInfo.from_name(LanguageModelName.AZURE_GPT_4o_2024_0806)
+     return EvaluationMetricConfig(
+         enabled=True,
+         name=EvaluationMetricName.CONTEXT_RELEVANCY,
+         language_model=model_info,
+     )
+
+
+ @pytest.fixture
+ def sample_input():
+     return EvaluationMetricInput(
+         input_text="test query",
+         context_texts=["test context 1", "test context 2"],
+     )
+
+
+ @pytest.mark.asyncio
+ async def test_analyze_disabled(evaluator, sample_input, basic_config):
+     basic_config.enabled = False
+     result = await evaluator.analyze(sample_input, basic_config)
+     assert result is None
+
+
+ @pytest.mark.asyncio
+ async def test_analyze_empty_context(evaluator, basic_config):
+     input_with_empty_context = EvaluationMetricInput(
+         input_text="test query", context_texts=[]
+     )
+
+     with pytest.raises(EvaluatorException) as exc_info:
+         await evaluator.analyze(input_with_empty_context, basic_config)
+
+     assert "No context texts provided." in str(exc_info.value)
+
+
+ @pytest.mark.asyncio
+ async def test_analyze_regular_output(evaluator, sample_input, basic_config):
+     mock_result = LanguageModelResponse(
+         choices=[
+             LanguageModelCompletionChoice(
+                 index=0,
+                 message=LanguageModelAssistantMessage(
+                     content="""{
+                         "value": "high",
+                         "reason": "Test reason"
+                     }"""
+                 ),
+                 finish_reason="stop",
+             )
+         ]
+     )
+
+     with patch.object(
+         evaluator.language_model_service,
+         "complete_async",
+         return_value=mock_result,
+     ) as mock_complete:
+         result = await evaluator.analyze(sample_input, basic_config)
+
+     assert isinstance(result, EvaluationMetricResult)
+     assert result.value.lower() == "high"
+     mock_complete.assert_called_once()
+
+
+ @pytest.mark.asyncio
+ async def test_analyze_structured_output(evaluator, sample_input, structured_config):
+     mock_result = LanguageModelResponse(
+         choices=[
+             LanguageModelCompletionChoice(
+                 index=0,
+                 message=LanguageModelAssistantMessage(
+                     content="HIGH",
+                     parsed={"value": "high", "reason": "Test reason"},
+                 ),
+                 finish_reason="stop",
+             )
+         ]
+     )
+
+     structured_output_schema = EvaluationSchemaStructuredOutput
+
+     with patch.object(
+         evaluator.language_model_service,
+         "complete_async",
+         return_value=mock_result,
+     ) as mock_complete:
+         result = await evaluator.analyze(
+             sample_input, structured_config, structured_output_schema
+         )
+     assert isinstance(result, EvaluationMetricResult)
+     assert result.value.lower() == "high"
+     mock_complete.assert_called_once()
+
+
+ @pytest.mark.asyncio
+ async def test_analyze_structured_output_validation_error(
+     evaluator, sample_input, structured_config
+ ):
+     mock_result = LanguageModelResponse(
+         choices=[
+             LanguageModelCompletionChoice(
+                 index=0,
+                 message=LanguageModelAssistantMessage(
+                     content="HIGH", parsed={"invalid": "data"}
+                 ),
+                 finish_reason="stop",
+             )
+         ]
+     )
+
+     structured_output_schema = EvaluationSchemaStructuredOutput
+
+     with patch.object(
+         evaluator.language_model_service,
+         "complete_async",
+         return_value=mock_result,
+     ):
+         with pytest.raises(EvaluatorException) as exc_info:
+             await evaluator.analyze(
+                 sample_input, structured_config, structured_output_schema
+             )
+     assert "Error occurred during structured output validation" in str(
+         exc_info.value
+     )
+
+
+ @pytest.mark.asyncio
+ async def test_analyze_regular_output_empty_response(
+     evaluator, sample_input, basic_config
+ ):
+     mock_result = LanguageModelResponse(
+         choices=[
+             LanguageModelCompletionChoice(
+                 index=0,
+                 message=LanguageModelAssistantMessage(content=""),
+                 finish_reason="stop",
+             )
+         ]
+     )
+
+     with patch.object(
+         evaluator.language_model_service,
+         "complete_async",
+         return_value=mock_result,
+     ):
+         with pytest.raises(EvaluatorException) as exc_info:
+             await evaluator.analyze(sample_input, basic_config)
+     assert "did not return a result" in str(exc_info.value)
+
+
+ def test_compose_msgs_regular(evaluator, sample_input, basic_config):
+     messages = evaluator._compose_msgs(
+         sample_input, basic_config, enable_structured_output=False
+     )
+
+     assert isinstance(messages, LanguageModelMessages)
+     assert messages.root[0].content == CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
+     assert isinstance(messages.root[1].content, str)
+     assert "test query" in messages.root[1].content
+     assert "test context 1" in messages.root[1].content
+     assert "test context 2" in messages.root[1].content
+
+
+ def test_compose_msgs_structured(evaluator, sample_input, structured_config):
+     messages = evaluator._compose_msgs(
+         sample_input, structured_config, enable_structured_output=True
+     )
+
+     assert isinstance(messages, LanguageModelMessages)
+     assert len(messages.root) == 2
+     assert (
+         messages.root[0].content != CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
+     )  # Should use structured output prompt
+     assert isinstance(messages.root[1].content, str)
+     assert "test query" in messages.root[1].content
+     assert "test context 1" in messages.root[1].content
+     assert "test context 2" in messages.root[1].content
+
+
+ @pytest.mark.asyncio
+ async def test_analyze_unknown_error(evaluator, sample_input, basic_config):
+     with patch.object(
+         evaluator.language_model_service,
+         "complete_async",
+         side_effect=Exception("Unknown error"),
+     ):
+         with pytest.raises(EvaluatorException) as exc_info:
+             await evaluator.analyze(sample_input, basic_config)
+     assert "Unknown error occurred during context relevancy metric analysis" in str(
+         exc_info.value
+     )

unique_toolkit/agentic/evaluation/tests/test_output_parser.py

@@ -0,0 +1,87 @@
+ import pytest
+
+ from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
+     EvaluationSchemaStructuredOutput,
+     Fact,
+ )
+ from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+ from unique_toolkit.agentic.evaluation.output_parser import (
+     parse_eval_metric_result,
+     parse_eval_metric_result_structured_output,
+ )
+ from unique_toolkit.agentic.evaluation.schemas import (
+     EvaluationMetricName,
+     EvaluationMetricResult,
+ )
+
+
+ def test_parse_eval_metric_result_success():
+     # Test successful parsing with all fields
+     result = '{"value": "high", "reason": "Test reason"}'
+     parsed = parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+
+     assert isinstance(parsed, EvaluationMetricResult)
+     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+     assert parsed.value == "high"
+     assert parsed.reason == "Test reason"
+     assert parsed.fact_list == []
+
+
+ def test_parse_eval_metric_result_missing_fields():
+     # Test parsing with missing fields (should use default "None")
+     result = '{"value": "high"}'
+     parsed = parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+
+     assert isinstance(parsed, EvaluationMetricResult)
+     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+     assert parsed.value == "high"
+     assert parsed.reason == "None"
+     assert parsed.fact_list == []
+
+
+ def test_parse_eval_metric_result_invalid_json():
+     # Test parsing with invalid JSON
+     result = "invalid json"
+     with pytest.raises(EvaluatorException) as exc_info:
+         parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+
+     assert "Error occurred during parsing the evaluation metric result" in str(
+         exc_info.value
+     )
+
+
+ def test_parse_eval_metric_result_structured_output_basic():
+     # Test basic structured output without fact list
+     result = EvaluationSchemaStructuredOutput(value="high", reason="Test reason")
+     parsed = parse_eval_metric_result_structured_output(
+         result, EvaluationMetricName.CONTEXT_RELEVANCY
+     )
+
+     assert isinstance(parsed, EvaluationMetricResult)
+     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+     assert parsed.value == "high"
+     assert parsed.reason == "Test reason"
+     assert parsed.fact_list == []
+
+
+ def test_parse_eval_metric_result_structured_output_with_facts():
+     # Test structured output with fact list
+     result = EvaluationSchemaStructuredOutput(
+         value="high",
+         reason="Test reason",
+         fact_list=[
+             Fact(fact="Fact 1"),
+             Fact(fact="Fact 2"),
+         ],
+     )
+     parsed = parse_eval_metric_result_structured_output(
+         result, EvaluationMetricName.CONTEXT_RELEVANCY
+     )
+
+     assert isinstance(parsed, EvaluationMetricResult)
+     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+     assert parsed.value == "high"
+     assert parsed.reason == "Test reason"
+     assert parsed.fact_list == ["Fact 1", "Fact 2"]
+     assert isinstance(parsed.fact_list, list)
+     assert len(parsed.fact_list) == 2  # None fact should be filtered out