unique_toolkit 0.7.7__py3-none-any.whl → 1.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unique_toolkit might be problematic.
- unique_toolkit/__init__.py +28 -1
- unique_toolkit/_common/api_calling/human_verification_manager.py +343 -0
- unique_toolkit/_common/base_model_type_attribute.py +303 -0
- unique_toolkit/_common/chunk_relevancy_sorter/config.py +49 -0
- unique_toolkit/_common/chunk_relevancy_sorter/exception.py +5 -0
- unique_toolkit/_common/chunk_relevancy_sorter/schemas.py +46 -0
- unique_toolkit/_common/chunk_relevancy_sorter/service.py +374 -0
- unique_toolkit/_common/chunk_relevancy_sorter/tests/test_service.py +275 -0
- unique_toolkit/_common/default_language_model.py +12 -0
- unique_toolkit/_common/docx_generator/__init__.py +7 -0
- unique_toolkit/_common/docx_generator/config.py +12 -0
- unique_toolkit/_common/docx_generator/schemas.py +80 -0
- unique_toolkit/_common/docx_generator/service.py +252 -0
- unique_toolkit/_common/docx_generator/template/Doc Template.docx +0 -0
- unique_toolkit/_common/endpoint_builder.py +305 -0
- unique_toolkit/_common/endpoint_requestor.py +430 -0
- unique_toolkit/_common/exception.py +24 -0
- unique_toolkit/_common/feature_flags/schema.py +9 -0
- unique_toolkit/_common/pydantic/rjsf_tags.py +936 -0
- unique_toolkit/_common/pydantic_helpers.py +154 -0
- unique_toolkit/_common/referencing.py +53 -0
- unique_toolkit/_common/string_utilities.py +140 -0
- unique_toolkit/_common/tests/test_referencing.py +521 -0
- unique_toolkit/_common/tests/test_string_utilities.py +506 -0
- unique_toolkit/_common/token/image_token_counting.py +67 -0
- unique_toolkit/_common/token/token_counting.py +204 -0
- unique_toolkit/_common/utils/__init__.py +1 -0
- unique_toolkit/_common/utils/files.py +43 -0
- unique_toolkit/_common/utils/structured_output/__init__.py +1 -0
- unique_toolkit/_common/utils/structured_output/schema.py +5 -0
- unique_toolkit/_common/utils/write_configuration.py +51 -0
- unique_toolkit/_common/validators.py +101 -4
- unique_toolkit/agentic/__init__.py +1 -0
- unique_toolkit/agentic/debug_info_manager/debug_info_manager.py +28 -0
- unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py +278 -0
- unique_toolkit/agentic/evaluation/config.py +36 -0
- unique_toolkit/{evaluators → agentic/evaluation}/context_relevancy/prompts.py +25 -0
- unique_toolkit/agentic/evaluation/context_relevancy/schema.py +80 -0
- unique_toolkit/agentic/evaluation/context_relevancy/service.py +273 -0
- unique_toolkit/agentic/evaluation/evaluation_manager.py +218 -0
- unique_toolkit/agentic/evaluation/hallucination/constants.py +61 -0
- unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py +111 -0
- unique_toolkit/{evaluators → agentic/evaluation}/hallucination/prompts.py +1 -1
- unique_toolkit/{evaluators → agentic/evaluation}/hallucination/service.py +16 -15
- unique_toolkit/{evaluators → agentic/evaluation}/hallucination/utils.py +30 -20
- unique_toolkit/{evaluators → agentic/evaluation}/output_parser.py +20 -2
- unique_toolkit/{evaluators → agentic/evaluation}/schemas.py +27 -7
- unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +253 -0
- unique_toolkit/agentic/evaluation/tests/test_output_parser.py +87 -0
- unique_toolkit/agentic/history_manager/history_construction_with_contents.py +297 -0
- unique_toolkit/agentic/history_manager/history_manager.py +242 -0
- unique_toolkit/agentic/history_manager/loop_token_reducer.py +484 -0
- unique_toolkit/agentic/history_manager/utils.py +96 -0
- unique_toolkit/agentic/postprocessor/postprocessor_manager.py +212 -0
- unique_toolkit/agentic/reference_manager/reference_manager.py +103 -0
- unique_toolkit/agentic/responses_api/__init__.py +19 -0
- unique_toolkit/agentic/responses_api/postprocessors/code_display.py +63 -0
- unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +145 -0
- unique_toolkit/agentic/responses_api/stream_handler.py +15 -0
- unique_toolkit/agentic/short_term_memory_manager/persistent_short_term_memory_manager.py +141 -0
- unique_toolkit/agentic/thinking_manager/thinking_manager.py +103 -0
- unique_toolkit/agentic/tools/__init__.py +1 -0
- unique_toolkit/agentic/tools/a2a/__init__.py +36 -0
- unique_toolkit/agentic/tools/a2a/config.py +17 -0
- unique_toolkit/agentic/tools/a2a/evaluation/__init__.py +15 -0
- unique_toolkit/agentic/tools/a2a/evaluation/_utils.py +66 -0
- unique_toolkit/agentic/tools/a2a/evaluation/config.py +55 -0
- unique_toolkit/agentic/tools/a2a/evaluation/evaluator.py +260 -0
- unique_toolkit/agentic/tools/a2a/evaluation/summarization_user_message.j2 +9 -0
- unique_toolkit/agentic/tools/a2a/manager.py +55 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/__init__.py +21 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/_display_utils.py +185 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/_ref_utils.py +73 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/config.py +45 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/display.py +180 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/references.py +101 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display_utils.py +1335 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_ref_utils.py +603 -0
- unique_toolkit/agentic/tools/a2a/prompts.py +46 -0
- unique_toolkit/agentic/tools/a2a/response_watcher/__init__.py +6 -0
- unique_toolkit/agentic/tools/a2a/response_watcher/service.py +91 -0
- unique_toolkit/agentic/tools/a2a/tool/__init__.py +4 -0
- unique_toolkit/agentic/tools/a2a/tool/_memory.py +26 -0
- unique_toolkit/agentic/tools/a2a/tool/_schema.py +9 -0
- unique_toolkit/agentic/tools/a2a/tool/config.py +73 -0
- unique_toolkit/agentic/tools/a2a/tool/service.py +306 -0
- unique_toolkit/agentic/tools/agent_chunks_hanlder.py +65 -0
- unique_toolkit/agentic/tools/config.py +167 -0
- unique_toolkit/agentic/tools/factory.py +44 -0
- unique_toolkit/agentic/tools/mcp/__init__.py +4 -0
- unique_toolkit/agentic/tools/mcp/manager.py +71 -0
- unique_toolkit/agentic/tools/mcp/models.py +28 -0
- unique_toolkit/agentic/tools/mcp/tool_wrapper.py +234 -0
- unique_toolkit/agentic/tools/openai_builtin/__init__.py +11 -0
- unique_toolkit/agentic/tools/openai_builtin/base.py +30 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/__init__.py +8 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/config.py +57 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/service.py +230 -0
- unique_toolkit/agentic/tools/openai_builtin/manager.py +62 -0
- unique_toolkit/agentic/tools/schemas.py +141 -0
- unique_toolkit/agentic/tools/test/test_mcp_manager.py +536 -0
- unique_toolkit/agentic/tools/test/test_tool_progress_reporter.py +445 -0
- unique_toolkit/agentic/tools/tool.py +183 -0
- unique_toolkit/agentic/tools/tool_manager.py +523 -0
- unique_toolkit/agentic/tools/tool_progress_reporter.py +285 -0
- unique_toolkit/agentic/tools/utils/__init__.py +19 -0
- unique_toolkit/agentic/tools/utils/execution/__init__.py +1 -0
- unique_toolkit/agentic/tools/utils/execution/execution.py +286 -0
- unique_toolkit/agentic/tools/utils/source_handling/__init__.py +0 -0
- unique_toolkit/agentic/tools/utils/source_handling/schema.py +21 -0
- unique_toolkit/agentic/tools/utils/source_handling/source_formatting.py +207 -0
- unique_toolkit/agentic/tools/utils/source_handling/tests/test_source_formatting.py +216 -0
- unique_toolkit/app/__init__.py +6 -0
- unique_toolkit/app/dev_util.py +180 -0
- unique_toolkit/app/init_sdk.py +32 -1
- unique_toolkit/app/schemas.py +198 -31
- unique_toolkit/app/unique_settings.py +367 -0
- unique_toolkit/chat/__init__.py +8 -1
- unique_toolkit/chat/deprecated/service.py +232 -0
- unique_toolkit/chat/functions.py +642 -77
- unique_toolkit/chat/rendering.py +34 -0
- unique_toolkit/chat/responses_api.py +461 -0
- unique_toolkit/chat/schemas.py +133 -2
- unique_toolkit/chat/service.py +115 -767
- unique_toolkit/content/functions.py +153 -4
- unique_toolkit/content/schemas.py +122 -15
- unique_toolkit/content/service.py +278 -44
- unique_toolkit/content/smart_rules.py +301 -0
- unique_toolkit/content/utils.py +8 -3
- unique_toolkit/embedding/service.py +102 -11
- unique_toolkit/framework_utilities/__init__.py +1 -0
- unique_toolkit/framework_utilities/langchain/client.py +71 -0
- unique_toolkit/framework_utilities/langchain/history.py +19 -0
- unique_toolkit/framework_utilities/openai/__init__.py +6 -0
- unique_toolkit/framework_utilities/openai/client.py +83 -0
- unique_toolkit/framework_utilities/openai/message_builder.py +229 -0
- unique_toolkit/framework_utilities/utils.py +23 -0
- unique_toolkit/language_model/__init__.py +3 -0
- unique_toolkit/language_model/builder.py +27 -11
- unique_toolkit/language_model/default_language_model.py +3 -0
- unique_toolkit/language_model/functions.py +327 -43
- unique_toolkit/language_model/infos.py +992 -50
- unique_toolkit/language_model/reference.py +242 -0
- unique_toolkit/language_model/schemas.py +475 -48
- unique_toolkit/language_model/service.py +228 -27
- unique_toolkit/protocols/support.py +145 -0
- unique_toolkit/services/__init__.py +7 -0
- unique_toolkit/services/chat_service.py +1630 -0
- unique_toolkit/services/knowledge_base.py +861 -0
- unique_toolkit/short_term_memory/service.py +178 -41
- unique_toolkit/smart_rules/__init__.py +0 -0
- unique_toolkit/smart_rules/compile.py +56 -0
- unique_toolkit/test_utilities/events.py +197 -0
- {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/METADATA +606 -7
- unique_toolkit-1.23.0.dist-info/RECORD +182 -0
- unique_toolkit/evaluators/__init__.py +0 -1
- unique_toolkit/evaluators/config.py +0 -35
- unique_toolkit/evaluators/constants.py +0 -1
- unique_toolkit/evaluators/context_relevancy/constants.py +0 -32
- unique_toolkit/evaluators/context_relevancy/service.py +0 -53
- unique_toolkit/evaluators/context_relevancy/utils.py +0 -142
- unique_toolkit/evaluators/hallucination/constants.py +0 -41
- unique_toolkit-0.7.7.dist-info/RECORD +0 -64
- /unique_toolkit/{evaluators → agentic/evaluation}/exception.py +0 -0
- {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/LICENSE +0 -0
- {unique_toolkit-0.7.7.dist-info → unique_toolkit-1.23.0.dist-info}/WHEEL +0 -0

unique_toolkit/{evaluators → agentic/evaluation}/hallucination/service.py
@@ -1,27 +1,28 @@
 import logging

-from unique_toolkit.
-from unique_toolkit.
-    EvaluationMetricConfig,
-)
-from unique_toolkit.evaluators.hallucination.constants import (
-    hallucination_metric_default_config,
-)
-from unique_toolkit.evaluators.hallucination.utils import check_hallucination_async
-from unique_toolkit.evaluators.schemas import (
+from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInput,
     EvaluationMetricResult,
 )
+from unique_toolkit.app.schemas import ChatEvent

-
+from .constants import hallucination_metric_default_config
+from .utils import check_hallucination
+
+SYSTEM_MSG_KEY = "systemPrompt"
+USER_MSG_KEY = "userPrompt"
+SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
+USER_MSG_DEFAULT_KEY = "userPromptDefault"


 class HallucinationEvaluator:
-    def __init__(self, event:
+    def __init__(self, event: ChatEvent):
         self.event = event
-        self.logger = logger

-
+        self.logger = logging.getLogger(f"HallucinationEvaluator.{__name__}")
+
+    async def analyze(
         self,
         input: EvaluationMetricInput,
         config: EvaluationMetricConfig = hallucination_metric_default_config,
@@ -36,7 +37,7 @@ class HallucinationEvaluator:
         If no contexts or history are referenced in the generated output, the method verifies
         that the output does not contain any relevant information to answer the question.

-        This method calls `
+        This method calls `check_hallucination` to perform the actual analysis. The `check_hallucination`
         function handles the evaluation using the company ID from the event, the provided input, and the configuration.

         Args:
@@ -53,6 +54,6 @@ class HallucinationEvaluator:
             self.logger.info("Hallucination metric is not enabled.")
             return None

-        return await
+        return await check_hallucination(
             company_id=self.event.company_id, input=input, config=config
         )
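The hunks above only rename imports and wire the evaluator into the new `agentic.evaluation` package, so usage stays close to the old API. Below is a minimal sketch of how the relocated evaluator could be called, assuming a `ChatEvent` from an incoming request; the `run_hallucination_check` wrapper and its arguments are illustrative, and only the import paths and signatures visible in the diff are taken as given.

```python
# Sketch only: the wrapper function and its arguments are hypothetical.
from unique_toolkit.agentic.evaluation.hallucination.service import (
    HallucinationEvaluator,
)
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput


async def run_hallucination_check(event, answer_text: str, context_texts: list[str]):
    # event is a ChatEvent; the evaluator reads company_id from it.
    evaluator = HallucinationEvaluator(event)
    metric_input = EvaluationMetricInput(
        input_text=answer_text,
        context_texts=context_texts,
    )
    # Uses hallucination_metric_default_config unless a config is passed;
    # returns None when the metric is disabled in that config.
    return await evaluator.analyze(metric_input)
```

Note that the old `check_hallucination_async` helper from `unique_toolkit.evaluators` is replaced by the relative `check_hallucination` import; callers that imported the old helper directly need the new path shown in the next file.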
unique_toolkit/{evaluators → agentic/evaluation}/hallucination/utils.py
@@ -1,32 +1,30 @@
 import logging
 from string import Template

-from unique_toolkit.
-
-
-from unique_toolkit.
-from unique_toolkit.evaluators.hallucination.constants import (
-    SYSTEM_MSG_DEFAULT_KEY,
-    SYSTEM_MSG_KEY,
-    USER_MSG_DEFAULT_KEY,
-    USER_MSG_KEY,
-    hallucination_required_input_fields,
-)
-from unique_toolkit.evaluators.output_parser import (
-    parse_eval_metric_result,
-)
-from unique_toolkit.evaluators.schemas import (
+from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+from unique_toolkit.agentic.evaluation.output_parser import parse_eval_metric_result
+from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInput,
     EvaluationMetricName,
     EvaluationMetricResult,
 )
+from unique_toolkit.content.schemas import ContentChunk
 from unique_toolkit.language_model.schemas import (
     LanguageModelMessages,
+    LanguageModelStreamResponse,
     LanguageModelSystemMessage,
     LanguageModelUserMessage,
 )
 from unique_toolkit.language_model.service import LanguageModelService

+from .constants import (
+    SYSTEM_MSG_DEFAULT_KEY,
+    SYSTEM_MSG_KEY,
+    USER_MSG_DEFAULT_KEY,
+    USER_MSG_KEY,
+    hallucination_required_input_fields,
+)
 from .prompts import (
     HALLUCINATION_METRIC_SYSTEM_MSG,
     HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
@@ -34,15 +32,12 @@ from .prompts import (
     HALLUCINATION_METRIC_USER_MSG_DEFAULT,
 )

-logger = logging.getLogger(__name__)
-

-async def
+async def check_hallucination(
     company_id: str,
     input: EvaluationMetricInput,
     config: EvaluationMetricConfig,
-
-) -> EvaluationMetricResult | None:
+) -> EvaluationMetricResult:
     """
     Analyzes the level of hallucination in the generated output by comparing it with the provided input
     and the contexts or history. The analysis classifies the hallucination level as:
@@ -73,6 +68,9 @@ async def check_hallucination_async(
     Raises:
         EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
     """
+
+    logger = logging.getLogger(f"check_hallucination.{__name__}")
+
     model_name = config.language_model.name
     logger.info(f"Analyzing level of hallucination with {model_name}.")

@@ -200,3 +198,15 @@ def _get_user_prompt_default(config: EvaluationMetricConfig):
         USER_MSG_DEFAULT_KEY,
         HALLUCINATION_METRIC_USER_MSG_DEFAULT,
     )
+
+
+def context_text_from_stream_response(
+    response: LanguageModelStreamResponse, selected_chunks: list[ContentChunk]
+):
+    response_references = response.message.references
+    reference_ids = [reference.source_id for reference in response_references]
+    filtered_contexts: list[str] = []
+    for chunk in selected_chunks:
+        if f"{chunk.id}_{chunk.chunk_id}" in reference_ids:
+            filtered_contexts.append(chunk.text)
+    return filtered_contexts
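The new `context_text_from_stream_response` helper filters the retrieved chunks down to the ones the streamed answer actually cites (matching `{chunk.id}_{chunk.chunk_id}` against the response references), which pairs naturally with `check_hallucination`. A rough sketch follows, assuming a streamed completion and its `selected_chunks` are already available; the `check_streamed_answer` wrapper, its `answer_text` parameter, and the overall wiring are illustrative.

```python
# Sketch only: stream_response, selected_chunks and answer_text are assumed to
# come from a prior streamed completion over retrieved ContentChunk objects.
from unique_toolkit.agentic.evaluation.hallucination.utils import (
    check_hallucination,
    context_text_from_stream_response,
)
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput


async def check_streamed_answer(
    company_id, config, stream_response, selected_chunks, answer_text
):
    # Keep only the chunk texts that the model actually referenced.
    cited_texts = context_text_from_stream_response(stream_response, selected_chunks)
    metric_input = EvaluationMetricInput(
        input_text=answer_text,
        context_texts=cited_texts,
    )
    return await check_hallucination(
        company_id=company_id, input=metric_input, config=config
    )
```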
unique_toolkit/{evaluators → agentic/evaluation}/output_parser.py
@@ -1,5 +1,8 @@
-from unique_toolkit.
-
+from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
+    EvaluationSchemaStructuredOutput,
+)
+from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricName,
     EvaluationMetricResult,
 )
@@ -28,3 +31,18 @@ def parse_eval_metric_result(
         value=parsed_result.get("value", "None"),
         reason=parsed_result.get("reason", "None"),
     )
+
+
+def parse_eval_metric_result_structured_output(
+    result: EvaluationSchemaStructuredOutput,
+    metric_name: EvaluationMetricName,
+) -> EvaluationMetricResult:
+    """
+    Parses the evaluation metric result.
+    """
+    return EvaluationMetricResult(
+        name=metric_name,
+        value=result.value,
+        reason=result.reason,
+        fact_list=[item.fact for item in result.fact_list],
+    )
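Alongside the JSON-string parser, the module now converts validated structured output directly into an `EvaluationMetricResult`, flattening `fact_list` into plain strings. A small sketch of that path is below; the field values are illustrative, while `Fact` and `EvaluationSchemaStructuredOutput` come from the context-relevancy schema added in this release and are exercised by the tests later in this diff.

```python
from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
    EvaluationSchemaStructuredOutput,
    Fact,
)
from unique_toolkit.agentic.evaluation.output_parser import (
    parse_eval_metric_result_structured_output,
)
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName

structured = EvaluationSchemaStructuredOutput(
    value="high",
    reason="The cited chunks support the answer.",  # illustrative value
    fact_list=[Fact(fact="Fact 1"), Fact(fact="Fact 2")],
)
result = parse_eval_metric_result_structured_output(
    structured, EvaluationMetricName.CONTEXT_RELEVANCY
)
# fact_list is flattened to a list of strings on the result model.
assert result.fact_list == ["Fact 1", "Fact 2"]
```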
unique_toolkit/{evaluators → agentic/evaluation}/schemas.py
@@ -1,18 +1,24 @@
-from enum import
+from enum import StrEnum
 from typing import Optional

-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field

+from unique_toolkit.agentic.evaluation.exception import EvaluatorException
 from unique_toolkit.chat import ChatMessage
-from unique_toolkit.
+from unique_toolkit.chat.schemas import (
+    ChatMessageAssessmentLabel,
+    ChatMessageAssessmentStatus,
+    ChatMessageAssessmentType,
+)


-class EvaluationMetricName(
+class EvaluationMetricName(StrEnum):
     HALLUCINATION = "hallucination"
     CONTEXT_RELEVANCY = "relevancy"
+    SUB_AGENT = "sub_agent"


-class EvaluationMetricInputFieldName(
+class EvaluationMetricInputFieldName(StrEnum):
     INPUT_TEXT = "input_text"
     CONTEXT_TEXTS = "context_texts"
     HISTORY_MESSAGES = "history_messages"
@@ -38,7 +44,7 @@ class EvaluationMetricInput(BaseModel):

         return "\n".join(
             [
-                f"<{tag_name}-{index}>{text}</{tag_name}-{index}>"
+                f"<{tag_name}-{index + 1}>{text}</{tag_name}-{index + 1}>"
                 for index, text in enumerate(self.context_texts)
             ]
         )
@@ -46,7 +52,7 @@ class EvaluationMetricInput(BaseModel):
     def get_history_message_text(self, chat_message: ChatMessage):
         return f"{chat_message.role.value}: {chat_message.content}"

-    def get_history_message_texts(self):
+    def get_history_message_texts(self) -> list[str]:
         if not self.history_messages:
             return []
         return [self.get_history_message_text(msg) for msg in self.history_messages]
@@ -77,6 +83,20 @@ class EvaluationMetricInput(BaseModel):


 class EvaluationMetricResult(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
     name: EvaluationMetricName
     value: str
     reason: str
+    is_positive: Optional[bool] = None
+    user_info: Optional[str] = None
+    error: Exception | None = None
+    fact_list: list[str] = Field(default_factory=list[str])
+
+
+class EvaluationAssessmentMessage(BaseModel):
+    status: ChatMessageAssessmentStatus
+    explanation: str
+    title: str
+    label: ChatMessageAssessmentLabel
+    type: ChatMessageAssessmentType
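For reference, a short sketch of the widened `EvaluationMetricResult` model after this change; the field names and defaults are taken from the hunk above, while the concrete values are illustrative.

```python
from unique_toolkit.agentic.evaluation.schemas import (
    EvaluationMetricName,
    EvaluationMetricResult,
)

result = EvaluationMetricResult(
    name=EvaluationMetricName.CONTEXT_RELEVANCY,
    value="high",
    reason="The cited chunks cover the question.",  # illustrative value
    is_positive=True,                # new optional field
    fact_list=["Fact 1", "Fact 2"],  # new field, defaults to []
)
# ConfigDict(arbitrary_types_allowed=True) is what lets the new
# `error: Exception | None` field carry a raised exception on the model.
```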
unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py
@@ -0,0 +1,253 @@
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
+    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+)
+from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
+    EvaluationSchemaStructuredOutput,
+)
+from unique_toolkit.agentic.evaluation.context_relevancy.service import (
+    ContextRelevancyEvaluator,
+)
+from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+from unique_toolkit.agentic.evaluation.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+from unique_toolkit.app.schemas import ChatEvent
+from unique_toolkit.chat.service import LanguageModelName
+from unique_toolkit.language_model.infos import (
+    LanguageModelInfo,
+)
+from unique_toolkit.language_model.schemas import (
+    LanguageModelAssistantMessage,
+    LanguageModelCompletionChoice,
+    LanguageModelMessages,
+)
+from unique_toolkit.language_model.service import LanguageModelResponse
+
+
+@pytest.fixture
+def event():
+    event = MagicMock(spec=ChatEvent)
+    event.payload = MagicMock()
+    event.payload.user_message = MagicMock()
+    event.payload.user_message.text = "Test query"
+    event.user_id = "user_0"
+    event.company_id = "company_0"
+    return event
+
+
+@pytest.fixture
+def evaluator(event):
+    return ContextRelevancyEvaluator(event)
+
+
+@pytest.fixture
+def basic_config():
+    return EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        language_model=LanguageModelInfo.from_name(
+            LanguageModelName.AZURE_GPT_4o_2024_0806
+        ),
+    )
+
+
+@pytest.fixture
+def structured_config(basic_config):
+    model_info = LanguageModelInfo.from_name(LanguageModelName.AZURE_GPT_4o_2024_0806)
+    return EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        language_model=model_info,
+    )
+
+
+@pytest.fixture
+def sample_input():
+    return EvaluationMetricInput(
+        input_text="test query",
+        context_texts=["test context 1", "test context 2"],
+    )
+
+
+@pytest.mark.asyncio
+async def test_analyze_disabled(evaluator, sample_input, basic_config):
+    basic_config.enabled = False
+    result = await evaluator.analyze(sample_input, basic_config)
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_analyze_empty_context(evaluator, basic_config):
+    input_with_empty_context = EvaluationMetricInput(
+        input_text="test query", context_texts=[]
+    )
+
+    with pytest.raises(EvaluatorException) as exc_info:
+        await evaluator.analyze(input_with_empty_context, basic_config)
+
+    assert "No context texts provided." in str(exc_info.value)
+
+
+@pytest.mark.asyncio
+async def test_analyze_regular_output(evaluator, sample_input, basic_config):
+    mock_result = LanguageModelResponse(
+        choices=[
+            LanguageModelCompletionChoice(
+                index=0,
+                message=LanguageModelAssistantMessage(
+                    content="""{
+                        "value": "high",
+                        "reason": "Test reason"
+                    }"""
+                ),
+                finish_reason="stop",
+            )
+        ]
+    )
+
+    with patch.object(
+        evaluator.language_model_service,
+        "complete_async",
+        return_value=mock_result,
+    ) as mock_complete:
+        result = await evaluator.analyze(sample_input, basic_config)
+
+    assert isinstance(result, EvaluationMetricResult)
+    assert result.value.lower() == "high"
+    mock_complete.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_analyze_structured_output(evaluator, sample_input, structured_config):
+    mock_result = LanguageModelResponse(
+        choices=[
+            LanguageModelCompletionChoice(
+                index=0,
+                message=LanguageModelAssistantMessage(
+                    content="HIGH",
+                    parsed={"value": "high", "reason": "Test reason"},
+                ),
+                finish_reason="stop",
+            )
+        ]
+    )
+
+    structured_output_schema = EvaluationSchemaStructuredOutput
+
+    with patch.object(
+        evaluator.language_model_service,
+        "complete_async",
+        return_value=mock_result,
+    ) as mock_complete:
+        result = await evaluator.analyze(
+            sample_input, structured_config, structured_output_schema
+        )
+    assert isinstance(result, EvaluationMetricResult)
+    assert result.value.lower() == "high"
+    mock_complete.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_analyze_structured_output_validation_error(
+    evaluator, sample_input, structured_config
+):
+    mock_result = LanguageModelResponse(
+        choices=[
+            LanguageModelCompletionChoice(
+                index=0,
+                message=LanguageModelAssistantMessage(
+                    content="HIGH", parsed={"invalid": "data"}
+                ),
+                finish_reason="stop",
+            )
+        ]
+    )
+
+    structured_output_schema = EvaluationSchemaStructuredOutput
+
+    with patch.object(
+        evaluator.language_model_service,
+        "complete_async",
+        return_value=mock_result,
+    ):
+        with pytest.raises(EvaluatorException) as exc_info:
+            await evaluator.analyze(
+                sample_input, structured_config, structured_output_schema
+            )
+    assert "Error occurred during structured output validation" in str(
+        exc_info.value
+    )
+
+
+@pytest.mark.asyncio
+async def test_analyze_regular_output_empty_response(
+    evaluator, sample_input, basic_config
+):
+    mock_result = LanguageModelResponse(
+        choices=[
+            LanguageModelCompletionChoice(
+                index=0,
+                message=LanguageModelAssistantMessage(content=""),
+                finish_reason="stop",
+            )
+        ]
+    )
+
+    with patch.object(
+        evaluator.language_model_service,
+        "complete_async",
+        return_value=mock_result,
+    ):
+        with pytest.raises(EvaluatorException) as exc_info:
+            await evaluator.analyze(sample_input, basic_config)
+    assert "did not return a result" in str(exc_info.value)
+
+
+def test_compose_msgs_regular(evaluator, sample_input, basic_config):
+    messages = evaluator._compose_msgs(
+        sample_input, basic_config, enable_structured_output=False
+    )
+
+    assert isinstance(messages, LanguageModelMessages)
+    assert messages.root[0].content == CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
+    assert isinstance(messages.root[1].content, str)
+    assert "test query" in messages.root[1].content
+    assert "test context 1" in messages.root[1].content
+    assert "test context 2" in messages.root[1].content
+
+
+def test_compose_msgs_structured(evaluator, sample_input, structured_config):
+    messages = evaluator._compose_msgs(
+        sample_input, structured_config, enable_structured_output=True
+    )
+
+    assert isinstance(messages, LanguageModelMessages)
+    assert len(messages.root) == 2
+    assert (
+        messages.root[0].content != CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
+    )  # Should use structured output prompt
+    assert isinstance(messages.root[1].content, str)
+    assert "test query" in messages.root[1].content
+    assert "test context 1" in messages.root[1].content
+    assert "test context 2" in messages.root[1].content
+
+
+@pytest.mark.asyncio
+async def test_analyze_unknown_error(evaluator, sample_input, basic_config):
+    with patch.object(
+        evaluator.language_model_service,
+        "complete_async",
+        side_effect=Exception("Unknown error"),
+    ):
+        with pytest.raises(EvaluatorException) as exc_info:
+            await evaluator.analyze(sample_input, basic_config)
+    assert "Unknown error occurred during context relevancy metric analysis" in str(
+        exc_info.value
+    )
unique_toolkit/agentic/evaluation/tests/test_output_parser.py
@@ -0,0 +1,87 @@
+import pytest
+
+from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
+    EvaluationSchemaStructuredOutput,
+    Fact,
+)
+from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+from unique_toolkit.agentic.evaluation.output_parser import (
+    parse_eval_metric_result,
+    parse_eval_metric_result_structured_output,
+)
+from unique_toolkit.agentic.evaluation.schemas import (
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+
+
+def test_parse_eval_metric_result_success():
+    # Test successful parsing with all fields
+    result = '{"value": "high", "reason": "Test reason"}'
+    parsed = parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+
+    assert isinstance(parsed, EvaluationMetricResult)
+    assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+    assert parsed.value == "high"
+    assert parsed.reason == "Test reason"
+    assert parsed.fact_list == []
+
+
+def test_parse_eval_metric_result_missing_fields():
+    # Test parsing with missing fields (should use default "None")
+    result = '{"value": "high"}'
+    parsed = parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+
+    assert isinstance(parsed, EvaluationMetricResult)
+    assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+    assert parsed.value == "high"
+    assert parsed.reason == "None"
+    assert parsed.fact_list == []
+
+
+def test_parse_eval_metric_result_invalid_json():
+    # Test parsing with invalid JSON
+    result = "invalid json"
+    with pytest.raises(EvaluatorException) as exc_info:
+        parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+
+    assert "Error occurred during parsing the evaluation metric result" in str(
+        exc_info.value
+    )
+
+
+def test_parse_eval_metric_result_structured_output_basic():
+    # Test basic structured output without fact list
+    result = EvaluationSchemaStructuredOutput(value="high", reason="Test reason")
+    parsed = parse_eval_metric_result_structured_output(
+        result, EvaluationMetricName.CONTEXT_RELEVANCY
+    )
+
+    assert isinstance(parsed, EvaluationMetricResult)
+    assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+    assert parsed.value == "high"
+    assert parsed.reason == "Test reason"
+    assert parsed.fact_list == []
+
+
+def test_parse_eval_metric_result_structured_output_with_facts():
+    # Test structured output with fact list
+    result = EvaluationSchemaStructuredOutput(
+        value="high",
+        reason="Test reason",
+        fact_list=[
+            Fact(fact="Fact 1"),
+            Fact(fact="Fact 2"),
+        ],
+    )
+    parsed = parse_eval_metric_result_structured_output(
+        result, EvaluationMetricName.CONTEXT_RELEVANCY
+    )
+
+    assert isinstance(parsed, EvaluationMetricResult)
+    assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
+    assert parsed.value == "high"
+    assert parsed.reason == "Test reason"
+    assert parsed.fact_list == ["Fact 1", "Fact 2"]
+    assert isinstance(parsed.fact_list, list)
+    assert len(parsed.fact_list) == 2  # None fact should be filtered out