unique_toolkit 0.8.12__tar.gz → 0.8.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/CHANGELOG.md +11 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/PKG-INFO +12 -1
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/pyproject.toml +1 -1
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/_common/validators.py +37 -2
- unique_toolkit-0.8.14/unique_toolkit/evals/evaluation_manager.py +206 -0
- unique_toolkit-0.8.14/unique_toolkit/evals/schemas.py +100 -0
- unique_toolkit-0.8.14/unique_toolkit/evaluators/exception.py +5 -0
- unique_toolkit-0.8.14/unique_toolkit/history_manager/history_manager.py +261 -0
- unique_toolkit-0.8.14/unique_toolkit/history_manager/utils.py +174 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/infos.py +119 -0
- unique_toolkit-0.8.14/unique_toolkit/postprocessor/postprocessor_manager.py +122 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/reference_manager/reference_manager.py +19 -0
- unique_toolkit-0.8.14/unique_toolkit/short_term_memory/persistent_short_term_memory_manager.py +140 -0
- unique_toolkit-0.8.14/unique_toolkit/thinking_manager/thinking_manager.py +102 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/schemas.py +0 -1
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/tool.py +1 -1
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/tool_manager.py +26 -10
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/LICENSE +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/README.md +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/_common/_base_service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/_common/_time_utils.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/_common/exception.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/_common/validate_required_values.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/dev_util.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/init_logging.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/init_sdk.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/performance/async_tasks.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/performance/async_wrapper.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/schemas.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/unique_settings.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/app/verification.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/chat/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/chat/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/chat/functions.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/chat/schemas.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/chat/service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/chat/state.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/chat/utils.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/content/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/content/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/content/functions.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/content/schemas.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/content/service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/content/utils.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/embedding/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/embedding/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/embedding/functions.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/embedding/schemas.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/embedding/service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/embedding/utils.py +0 -0
- {unique_toolkit-0.8.12/unique_toolkit/evaluators → unique_toolkit-0.8.14/unique_toolkit/evals}/exception.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/config.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/context_relevancy/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/context_relevancy/prompts.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/context_relevancy/service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/context_relevancy/utils.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/hallucination/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/hallucination/prompts.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/hallucination/service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/hallucination/utils.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/output_parser.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/evaluators/schemas.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/framework_utilities/langchain/client.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/framework_utilities/langchain/history.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/framework_utilities/openai/client.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/framework_utilities/openai/message_builder.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/framework_utilities/utils.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/builder.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/functions.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/prompt.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/reference.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/schemas.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/language_model/utils.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/protocols/support.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/short_term_memory/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/short_term_memory/constants.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/short_term_memory/functions.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/short_term_memory/schemas.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/short_term_memory/service.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/smart_rules/__init__.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/smart_rules/compile.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/agent_chunks_handler.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/config.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/factory.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/test/test_tool_progress_reporter.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/tool_progress_reporter.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/utils/execution/execution.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/utils/source_handling/schema.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/utils/source_handling/source_formatting.py +0 -0
- {unique_toolkit-0.8.12 → unique_toolkit-0.8.14}/unique_toolkit/tools/utils/source_handling/tests/test_source_formatting.py +0 -0
|
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.8.14] - 2025-08-19
|
|
9
|
+
- Including GPT-5 series deployed via LiteLLM into language model info
|
|
10
|
+
|
|
11
|
+
## [0.8.13] - 2025-08-18
|
|
12
|
+
- Adding initial versions of
|
|
13
|
+
- Evaluation Manager
|
|
14
|
+
- History Manager
|
|
15
|
+
- Postprocessor Manager
|
|
16
|
+
- Thinking Manager
|
|
17
|
+
- Updated tool manager
|
|
18
|
+
|
|
8
19
|
## [0.8.12] - 2025-08-18
|
|
9
20
|
- Fix missing tool call response in ChatMessage -> OpenAI messages translation
|
|
10
21
|
- Add simple append method to OpenAIMessageBuilder
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unique_toolkit
|
|
3
|
-
Version: 0.8.12
|
|
3
|
+
Version: 0.8.14
|
|
4
4
|
Summary:
|
|
5
5
|
License: Proprietary
|
|
6
6
|
Author: Martin Fadler
|
|
@@ -114,6 +114,17 @@ All notable changes to this project will be documented in this file.
|
|
|
114
114
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
115
115
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
116
116
|
|
|
117
|
+
## [0.8.14] - 2025-08-19
|
|
118
|
+
- Including GPT-5 series deployed via LiteLLM into language model info
|
|
119
|
+
|
|
120
|
+
## [0.8.13] - 2025-08-18
|
|
121
|
+
- Adding initial versions of
|
|
122
|
+
- Evaluation Manager
|
|
123
|
+
- History Manager
|
|
124
|
+
- Postprocessor Manager
|
|
125
|
+
- Thinking Manager
|
|
126
|
+
- Updated tool manager
|
|
127
|
+
|
|
117
128
|
## [0.8.12] - 2025-08-18
|
|
118
129
|
- Fix missing tool call response in ChatMessage -> OpenAI messages translation
|
|
119
130
|
- Add simple append method to OpenAIMessageBuilder
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Annotated, Any
|
|
2
3
|
|
|
3
|
-
from pydantic import BeforeValidator, PlainSerializer
|
|
4
|
+
from pydantic import BeforeValidator, Field, PlainSerializer, ValidationInfo
|
|
4
5
|
|
|
5
6
|
from unique_toolkit.language_model import LanguageModelName
|
|
6
7
|
from unique_toolkit.language_model.infos import (
|
|
@@ -8,6 +9,11 @@ from unique_toolkit.language_model.infos import (
|
|
|
8
9
|
LanguageModelProvider,
|
|
9
10
|
)
|
|
10
11
|
|
|
12
|
+
from pydantic.fields import FieldInfo
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
11
17
|
# TODO @klcd: Inform on deprecation of str as input
|
|
12
18
|
LMI = Annotated[
|
|
13
19
|
LanguageModelInfo,
|
|
@@ -55,3 +61,32 @@ def validate_and_init_language_model_info(
|
|
|
55
61
|
)
|
|
56
62
|
|
|
57
63
|
return v
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def ClipInt(*, min_value: int, max_value: int) -> tuple[BeforeValidator, FieldInfo]:
    """Build a validator/constraint pair that clamps integers into a range.

    The returned ``BeforeValidator`` coerces any non-``int`` input with
    ``int()`` and replaces an out-of-range value with the bound it violated,
    logging a warning whenever it does so. The returned ``Field`` declares
    the same inclusive bounds through ``ge``/``le``.

    Args:
        min_value: Inclusive lower bound.
        max_value: Inclusive upper bound.

    Returns:
        A ``(BeforeValidator, FieldInfo)`` tuple.
        NOTE(review): presumably meant to be unpacked into ``Annotated[int, ...]``
        -- confirm against callers.
    """

    def _validator(value: Any, info: ValidationInfo) -> Any:
        number = value if isinstance(value, int) else int(value)
        field_name = info.field_name

        # The lower bound is tested before the upper bound, so it wins if the
        # two bounds were ever passed in inverted order.
        if number < min_value:
            template = (
                "Field %s is below the allowed minimum of %s. It will be set to %s."
            )
            bound = min_value
        elif number > max_value:
            template = (
                "Field %s is above the allowed maximum of %s. It will be set to %s."
            )
            bound = max_value
        else:
            return number

        logger.warning(template, field_name, bound, bound)
        return bound

    constraint = Field(ge=min_value, le=max_value)
    return (BeforeValidator(_validator), constraint)
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
import asyncio
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
from unique_toolkit.tools.utils.execution.execution import Result, SafeTaskExecutor
|
|
6
|
+
from logging import Logger
|
|
7
|
+
from unique_toolkit.evals.schemas import (
|
|
8
|
+
EvaluationAssessmentMessage,
|
|
9
|
+
EvaluationMetricName,
|
|
10
|
+
EvaluationMetricResult,
|
|
11
|
+
)
|
|
12
|
+
from unique_toolkit.chat.schemas import (
|
|
13
|
+
ChatMessageAssessmentStatus,
|
|
14
|
+
ChatMessageAssessmentType,
|
|
15
|
+
)
|
|
16
|
+
from unique_toolkit.chat.service import ChatService
|
|
17
|
+
from unique_toolkit.language_model.schemas import (
|
|
18
|
+
LanguageModelStreamResponse,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Evaluation(ABC):
    """
    Base class for a single evaluation metric.

    Typical use cases include:
    - hallucination checking
    - compliance checking

    ``get_assessment_type``, ``run`` and ``evaluation_metric_to_assessment``
    only raise ``NotImplementedError`` here; concrete evaluations are expected
    to override them.
    """

    def __init__(self, name: EvaluationMetricName):
        self.name = name

    def get_name(self) -> EvaluationMetricName:
        """Return the metric name this evaluation was constructed with."""
        return self.name

    def get_assessment_type(self) -> ChatMessageAssessmentType:
        """Return the assessment type of this evaluation (must be overridden)."""
        message = "Subclasses must implement this method to return the assessment type."
        raise NotImplementedError(message)

    async def run(
        self, loop_response: LanguageModelStreamResponse
    ) -> EvaluationMetricResult:
        """Evaluate ``loop_response`` and return the metric result (must be overridden)."""
        message = "Subclasses must implement this method."
        raise NotImplementedError(message)

    async def evaluation_metric_to_assessment(
        self, evaluation_result: EvaluationMetricResult
    ) -> EvaluationAssessmentMessage:
        """Convert ``evaluation_result`` into an assessment message (must be overridden)."""
        message = "Subclasses must implement this method to convert evaluation results to assessment messages."
        raise NotImplementedError(message)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class EvaluationManager:
    """
    Manages the evaluation metrics and executes evaluation calls.

    This class is responsible for:
    - Storing and managing evaluation instances, identified by their unique names.
    - Executing selected evaluations asynchronously and processing their results.
    - Tracking the overall success or failure of evaluations.
    - Integrating with external services like logging and chat systems to display evaluation statuses and results.
    - Handling errors gracefully, including missing evaluations or failed executions.

    Key Features:
    - Evaluation Storage: Maintains a dictionary of evaluation instances for quick retrieval.
    - Asynchronous Execution: Supports concurrent execution of multiple evaluations for efficiency.
    - Result Processing: Tracks evaluation outcomes and updates the internal state based on results.
    - Chat Integration: Updates the chat interface with evaluation statuses and detailed assessments.
    - Error Handling: Provides robust error messages and fallback mechanisms for missing or failed evaluations.

    The EvaluationManager serves as the central hub for managing and executing evaluations.
    """

    # Type declaration only. The dict is created per instance in __init__;
    # a class-level ``{}`` would be shared by every manager, so evaluations
    # registered on one instance would show up on all others.
    _evaluations: dict[EvaluationMetricName, Evaluation]
    _evaluation_passed: bool = True

    def __init__(
        self,
        logger: Logger,
        chat_service: ChatService,
        assistant_message_id: str,
    ):
        """
        Args:
            logger: Logger used for progress messages and by the task executor.
            chat_service: Service used to create and modify message assessments.
            assistant_message_id: Id of the assistant message the assessments
                are attached to.
        """
        self._logger = logger
        self._chat_service = chat_service
        self._assistant_message_id = assistant_message_id
        self._evaluations = {}
        self._evaluation_passed = True

    def add_evaluation(self, evaluation: Evaluation):
        """Register ``evaluation`` under its name, replacing any previous entry."""
        self._evaluations[evaluation.get_name()] = evaluation

    def get_evaluation_by_name(self, name: EvaluationMetricName) -> Evaluation | None:
        """Return the evaluation registered under ``name``, or ``None``."""
        return self._evaluations.get(name)

    async def run_evaluations(
        self,
        selected_evaluation_names: list[EvaluationMetricName],
        loop_response: LanguageModelStreamResponse,
    ) -> list[EvaluationMetricResult]:
        """
        Run the selected evaluations concurrently against ``loop_response``.

        Returns:
            One ``EvaluationMetricResult`` per requested name, in the same
            order as ``selected_evaluation_names``.
        """
        task_executor = SafeTaskExecutor(
            logger=self._logger,
        )

        tasks = [
            task_executor.execute_async(
                self.execute_evaluation_call,
                loop_response=loop_response,
                evaluation_name=evaluation_name,
            )
            for evaluation_name in selected_evaluation_names
        ]
        # gather() preserves task order, so results line up with the names.
        evaluation_results = await asyncio.gather(*tasks)

        evaluation_results_unpacked: list[EvaluationMetricResult] = []
        for evaluation_name, result in zip(
            selected_evaluation_names, evaluation_results
        ):
            unpacked_evaluation_result = self._create_evaluation_metric_result(
                result, evaluation_name
            )
            # ``is_positive`` may be None; a falsy value marks the run as failed.
            if not unpacked_evaluation_result.is_positive:
                self._evaluation_passed = False
            evaluation_results_unpacked.append(unpacked_evaluation_result)

        return evaluation_results_unpacked

    async def execute_evaluation_call(
        self,
        evaluation_name: EvaluationMetricName,
        loop_response: LanguageModelStreamResponse,
    ) -> EvaluationMetricResult:
        """
        Run a single evaluation and publish its assessment to the chat.

        If no evaluation is registered under ``evaluation_name`` a placeholder
        result carrying an error is returned instead of raising.
        """
        self._logger.info(f"Processing evaluation: {evaluation_name}")

        evaluation_instance = self.get_evaluation_by_name(evaluation_name)

        if not evaluation_instance:
            not_found_message = f"Evaluation named {evaluation_name} not found"
            # NOTE(review): placeholder results use is_positive=True with value
            # "RED", so they never flip _evaluation_passed -- presumably
            # deliberate; confirm.
            return EvaluationMetricResult(
                name=evaluation_name,
                is_positive=True,
                value="RED",
                reason=not_found_message,
                error=Exception(not_found_message),
            )

        # Create the pending assessment first, then execute the evaluation.
        await self._create_assistant_message(evaluation_instance)
        evaluation_metric_result: EvaluationMetricResult = (
            await evaluation_instance.run(loop_response)
        )
        # show results to the user
        await self._show_message_assessment(
            evaluation_instance, evaluation_metric_result
        )

        return evaluation_metric_result

    def _create_evaluation_metric_result(
        self,
        result: Result[EvaluationMetricResult],
        evaluation_name: EvaluationMetricName,
    ) -> EvaluationMetricResult:
        """
        Unwrap an executor ``Result`` into an ``EvaluationMetricResult``.

        Unsuccessful results and results of an unexpected type are converted
        into placeholder results that carry an error.
        """
        if not result.success:
            return EvaluationMetricResult(
                name=evaluation_name,
                is_positive=True,
                value="RED",
                reason=str(result.exception),
                error=Exception("Evaluation result is not successful"),
            )

        unpacked = result.unpack()
        if not isinstance(unpacked, EvaluationMetricResult):
            wrong_type_message = (
                "Evaluation result is not of type EvaluationMetricResult"
            )
            return EvaluationMetricResult(
                name=evaluation_name,
                is_positive=True,
                value="RED",
                reason=wrong_type_message,
                error=Exception(wrong_type_message),
            )
        return unpacked

    async def _show_message_assessment(
        self,
        evaluation_instance: Evaluation,
        evaluation_metric_result: EvaluationMetricResult,
    ) -> None:
        """Convert the metric result to an assessment and update it in the chat."""
        evaluation_assessment_message = (
            await evaluation_instance.evaluation_metric_to_assessment(
                evaluation_metric_result
            )
        )
        await self._chat_service.modify_message_assessment_async(
            assistant_message_id=self._assistant_message_id,
            status=evaluation_assessment_message.status,
            title=evaluation_assessment_message.title,
            explanation=evaluation_assessment_message.explanation,
            label=evaluation_assessment_message.label,
            type=evaluation_assessment_message.type,
        )

    async def _create_assistant_message(self, evaluation_instance: Evaluation):
        """Create a PENDING assessment of the evaluation's type on the assistant message."""
        await self._chat_service.create_message_assessment_async(
            assistant_message_id=self._assistant_message_id,
            status=ChatMessageAssessmentStatus.PENDING,
            type=evaluation_instance.get_assessment_type(),
        )
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
5
|
+
from unique_toolkit.chat import ChatMessage
|
|
6
|
+
from unique_toolkit.evals.exception import EvaluatorException
|
|
7
|
+
from unique_toolkit.chat.schemas import (
|
|
8
|
+
ChatMessageAssessmentLabel,
|
|
9
|
+
ChatMessageAssessmentStatus,
|
|
10
|
+
ChatMessageAssessmentType,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EvaluationMetricName(StrEnum):
|
|
15
|
+
HALLUCINATION = "hallucination"
|
|
16
|
+
CONTEXT_RELEVANCY = "relevancy"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EvaluationMetricInputFieldName(StrEnum):
|
|
20
|
+
INPUT_TEXT = "input_text"
|
|
21
|
+
CONTEXT_TEXTS = "context_texts"
|
|
22
|
+
HISTORY_MESSAGES = "history_messages"
|
|
23
|
+
OUTPUT_TEXT = "output_text"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class EvaluationMetricInput(BaseModel):
    """
    Inputs for a metric evaluation.

    Every field is optional; which ones must be present depends on the metric
    and can be enforced with ``validate_required_fields``.
    """

    input_text: Optional[str] = None
    context_texts: Optional[list[str]] = None
    history_messages: Optional[list[ChatMessage]] = None
    output_text: Optional[str] = None

    def get_joined_context_texts(self, tag_name: str = "reference") -> str:
        """
        Join the context texts, one per line, each wrapped in a numbered tag
        (``<tag_name-1>...</tag_name-1>``). Returns a placeholder string when
        there are no context texts.
        """
        if not self.context_texts:
            return f"<No {tag_name} texts provided>"

        return "\n".join(
            f"<{tag_name}-{number}>{text}</{tag_name}-{number}>"
            for number, text in enumerate(self.context_texts, start=1)
        )

    def get_history_message_text(self, chat_message: ChatMessage):
        """Render one chat message as ``"<role>: <content>"``."""
        role = chat_message.role.value
        return f"{role}: {chat_message.content}"

    def get_history_message_texts(self) -> list[str]:
        """Render every history message; empty list when there is no history."""
        messages = self.history_messages or []
        return [self.get_history_message_text(message) for message in messages]

    def get_joined_history_texts(self, tag_name: str = "conversation") -> str:
        """
        Join the rendered history messages, one per line. Returns a
        placeholder string when there is no history.
        """
        if self.history_messages:
            return "\n".join(self.get_history_message_texts())

        return f"<No {tag_name} texts provided>"

    def validate_required_fields(
        self, required_fields: list[EvaluationMetricInputFieldName]
    ):
        """
        Ensure every field named in ``required_fields`` is set.

        Raises:
            EvaluatorException: for the first required field whose value is ``None``.
        """
        for field in required_fields:
            if getattr(self, field) is not None:
                continue

            error_message = f"Missing required input field: {field}"
            raise EvaluatorException(
                user_message=error_message,
                error_message=error_message,
            )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class EvaluationMetricResult(BaseModel):
    """Outcome of one metric evaluation."""

    # Needed because ``error`` holds a plain ``Exception`` instance.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: EvaluationMetricName
    value: str
    reason: str
    is_positive: bool | None = None
    user_info: str | None = None
    error: Exception | None = None
    fact_list: list[str] = Field(default_factory=list)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class EvaluationAssessmentMessage(BaseModel):
    """Chat-facing form of an evaluation result (status, title, explanation, label, type)."""

    # Field order is kept as declared; it determines the model's field order.
    status: ChatMessageAssessmentStatus
    explanation: str
    title: str
    label: ChatMessageAssessmentLabel
    type: ChatMessageAssessmentType
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from logging import Logger
|
|
3
|
+
from typing import Awaitable, Callable
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from unique_toolkit.app.schemas import ChatEvent
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from unique_toolkit.chat.schemas import ChatMessage
|
|
11
|
+
from unique_toolkit.chat.service import ChatService
|
|
12
|
+
from unique_toolkit.content.schemas import Content
|
|
13
|
+
from unique_toolkit.content.service import ContentService
|
|
14
|
+
from unique_toolkit.language_model.builder import MessagesBuilder
|
|
15
|
+
from unique_toolkit.language_model.schemas import (
|
|
16
|
+
LanguageModelAssistantMessage,
|
|
17
|
+
LanguageModelFunction,
|
|
18
|
+
LanguageModelMessage,
|
|
19
|
+
LanguageModelMessageRole,
|
|
20
|
+
LanguageModelToolMessage,
|
|
21
|
+
LanguageModelUserMessage,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
from unique_toolkit.tools.schemas import ToolCallResponse
|
|
25
|
+
from unique_toolkit.content.utils import count_tokens
|
|
26
|
+
from unique_toolkit.history_manager.utils import transform_chunks_to_string
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class HistoryManagerConfig(BaseModel):
    """Configuration for the history manager."""

    class ExperimentalFeatures(BaseModel):
        """Opt-in switches for experimental history-manager behaviour."""

        full_sources_serialize_dump: bool = Field(
            default=False,
            description="If True, the sources will be serialized in full, otherwise only the content will be serialized.",
        )

        def __init__(self, full_sources_serialize_dump: bool = False):
            # Delegate to BaseModel.__init__ so pydantic sets up its internal
            # state and validates the value. Assigning the attribute directly
            # on an uninitialised model fails inside pydantic's __setattr__.
            super().__init__(full_sources_serialize_dump=full_sources_serialize_dump)

    experimental_features: ExperimentalFeatures = Field(
        default=ExperimentalFeatures(),
        description="Experimental features for the history manager.",
    )

    max_history_tokens: int = Field(
        default=8000,
        ge=0,
        description="The maximum number of tokens to keep in the history.",
    )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class HistoryManager:
|
|
53
|
+
"""
|
|
54
|
+
Manages the history of tool calls and conversation loops.
|
|
55
|
+
|
|
56
|
+
This class is responsible for:
|
|
57
|
+
- Storing and maintaining the history of tool call results and conversation messages.
|
|
58
|
+
- Merging uploaded content with the conversation history for a unified view.
|
|
59
|
+
- Limiting the history to fit within a configurable token window for efficient processing.
|
|
60
|
+
- Providing methods to retrieve, manipulate, and append to the conversation history.
|
|
61
|
+
- Handling post-processing steps to clean or modify the history as needed.
|
|
62
|
+
|
|
63
|
+
Key Features:
|
|
64
|
+
- Tool Call History: Tracks the results of tool calls and appends them to the conversation history.
|
|
65
|
+
- Loop History: Maintains a record of conversation loops, including assistant and user messages.
|
|
66
|
+
- History Merging: Combines uploaded files and chat messages into a cohesive history.
|
|
67
|
+
- Token Window Management: Ensures the history stays within a specified token limit for optimal performance.
|
|
68
|
+
- Post-Processing Support: Allows for custom transformations or cleanup of the conversation history.
|
|
69
|
+
|
|
70
|
+
The HistoryManager serves as the backbone for managing and retrieving conversation history in a structured and efficient manner.
|
|
71
|
+
"""
|
|
72
|
+
_tool_call_result_history: list[ToolCallResponse] = []
|
|
73
|
+
_loop_history: list[LanguageModelMessage] = []
|
|
74
|
+
_source_enumerator = 0
|
|
75
|
+
|
|
76
|
+
def __init__(
|
|
77
|
+
self,
|
|
78
|
+
logger: Logger,
|
|
79
|
+
event: ChatEvent,
|
|
80
|
+
config: HistoryManagerConfig,
|
|
81
|
+
):
|
|
82
|
+
self._config = config
|
|
83
|
+
self._logger = logger
|
|
84
|
+
self._chat_service = ChatService(event)
|
|
85
|
+
self._content_service = ContentService.from_event(event)
|
|
86
|
+
|
|
87
|
+
def has_no_loop_messages(self) -> bool:
|
|
88
|
+
return len(self._loop_history) == 0
|
|
89
|
+
|
|
90
|
+
def add_tool_call_results(self, tool_call_results: list[ToolCallResponse]):
|
|
91
|
+
for tool_response in tool_call_results:
|
|
92
|
+
if not tool_response.successful:
|
|
93
|
+
self._loop_history.append(
|
|
94
|
+
LanguageModelToolMessage(
|
|
95
|
+
name=tool_response.name,
|
|
96
|
+
tool_call_id=tool_response.id,
|
|
97
|
+
content=f"Tool call {tool_response.name} failed with error: {tool_response.error_message}",
|
|
98
|
+
)
|
|
99
|
+
)
|
|
100
|
+
continue
|
|
101
|
+
self._append_tool_call_result_to_history(tool_response)
|
|
102
|
+
|
|
103
|
+
def _append_tool_call_result_to_history(
|
|
104
|
+
self,
|
|
105
|
+
tool_response: ToolCallResponse,
|
|
106
|
+
) -> None:
|
|
107
|
+
tool_call_result_for_history = self._get_tool_call_result_for_loop_history(
|
|
108
|
+
tool_response=tool_response
|
|
109
|
+
)
|
|
110
|
+
self._loop_history.append(tool_call_result_for_history)
|
|
111
|
+
|
|
112
|
+
def _get_tool_call_result_for_loop_history(
|
|
113
|
+
self,
|
|
114
|
+
tool_response: ToolCallResponse,
|
|
115
|
+
) -> LanguageModelMessage:
|
|
116
|
+
self._logger.debug(
|
|
117
|
+
f"Appending tool call result to history: {tool_response.name}"
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
content_chunks = (
|
|
121
|
+
tool_response.content_chunks or []
|
|
122
|
+
) # it can be that the tool response does not have content chunks
|
|
123
|
+
|
|
124
|
+
# Transform content chunks into sources to be appended to tool result
|
|
125
|
+
sources = transform_chunks_to_string(
|
|
126
|
+
content_chunks,
|
|
127
|
+
self._source_enumerator,
|
|
128
|
+
None, # Use None for SourceFormatConfig
|
|
129
|
+
self._config.experimental_features.full_sources_serialize_dump,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
self._source_enumerator += len(
|
|
133
|
+
sources
|
|
134
|
+
) # To make sure all sources have unique source numbers
|
|
135
|
+
|
|
136
|
+
# Append the result to the history
|
|
137
|
+
return LanguageModelToolMessage(
|
|
138
|
+
content=sources,
|
|
139
|
+
tool_call_id=tool_response.id, # type: ignore
|
|
140
|
+
name=tool_response.name,
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
def _append_tool_calls_to_history(
    self, tool_calls: list[LanguageModelFunction]
) -> None:
    """Record the given tool calls as one assistant message in the loop history.

    Args:
        tool_calls: The function calls to wrap via
            ``LanguageModelAssistantMessage.from_functions``.
    """
    assistant_message = LanguageModelAssistantMessage.from_functions(
        tool_calls=tool_calls
    )
    self._loop_history.append(assistant_message)
|
|
149
|
+
|
|
150
|
+
def add_assistant_message(self, message: LanguageModelAssistantMessage) -> None:
    """Append an assistant message to the loop history.

    Args:
        message: The assistant message to store.
    """
    loop_history = self._loop_history
    loop_history.append(message)
|
|
152
|
+
|
|
153
|
+
async def get_history(
    self,
    postprocessing_step: Callable[
        [list[LanguageModelMessage]], list[LanguageModelMessage]
    ]
    | None = None,
) -> list[LanguageModelMessage]:
    """
    Get the history of the conversation, limited to the configured token budget.

    The files uploaded to the chat and the full message history are fetched,
    merged into one message list, optionally passed through
    ``postprocessing_step`` and finally limited to ``max_history_tokens``
    from the configuration.

    Args:
        postprocessing_step: Optional callable applied to the merged history
            before the token limit is enforced.

    Returns:
        list[LanguageModelMessage]: The history
    """
    chat_service = self._chat_service

    # Files uploaded to this chat.
    uploads = self._content_service.search_content_on_chat(
        chat_id=chat_service.chat_id
    )
    # Every message of the chat.
    chat_messages = await chat_service.get_full_history_async()

    history = self._merge_history_and_uploads(chat_messages, uploads)
    if postprocessing_step is not None:
        history = postprocessing_step(history)

    # NOTE: the history is fetched fresh, so (per the original author's note)
    # it is expected to already contain the current user message; nothing is
    # appended manually here.
    return self._limit_to_token_window(history, self._config.max_history_tokens)
|
|
211
|
+
|
|
212
|
+
def _merge_history_and_uploads(
    self, history: list[ChatMessage], uploads: list[Content]
) -> list[LanguageModelMessage]:
    """Interleave chat messages and uploaded files in ``created_at`` order.

    Uploads without a ``created_at`` are dropped; chat messages without one
    sort before all timestamped entries. Each upload becomes a user message
    naming the file key and content id, each chat message becomes a
    ``LanguageModelMessage`` with the same role and content.

    Args:
        history: Chat messages of the conversation.
        uploads: Content items uploaded to the chat.

    Returns:
        The merged messages, ordered by ``created_at``.
    """
    # Only uploads that carry a timestamp can be placed in the timeline.
    dated_uploads = [content for content in uploads if content.created_at]

    def _chronological_key(entry):
        # The leading flag puts entries without a timestamp first on its own,
        # so the naive ``datetime.min`` placeholder is never ordered against a
        # real timestamp. Ordering a naive against a timezone-aware datetime
        # raises TypeError.
        # NOTE(review): whether created_at is timezone-aware is not visible
        # here; the key is safe either way.
        created_at = entry.created_at
        return (bool(created_at), created_at or datetime.min)

    timeline = sorted(history + dated_uploads, key=_chronological_key)

    msg_builder = MessagesBuilder()
    for entry in timeline:
        if isinstance(entry, Content):
            msg_builder.user_message_append(
                f"Uploaded file: {entry.key}, ContentId: {entry.id}"
            )
            continue
        msg_builder.messages.append(
            LanguageModelMessage(
                role=LanguageModelMessageRole(entry.role),
                content=entry.content,
            )
        )
    return msg_builder.messages
|
|
236
|
+
|
|
237
|
+
def _limit_to_token_window(
    self, messages: list[LanguageModelMessage], token_limit: int
) -> list[LanguageModelMessage]:
    """Keep the most recent messages whose combined token count fits the limit.

    Messages are walked from the last to the first; the walk stops at the
    first message that would push the running total above ``token_limit``.

    Args:
        messages: Messages in their original order.
        token_limit: Maximum total token count of the returned messages.

    Returns:
        The retained messages, in the same relative order as the input.
    """
    newest_first = []
    used_tokens = 0
    for message in reversed(messages):
        projected_total = used_tokens + count_tokens(str(message.content))
        if projected_total > token_limit:
            break
        newest_first.append(message)
        used_tokens = projected_total

    # Collected newest-first; flip back to the input order.
    newest_first.reverse()
    return newest_first
|
|
249
|
+
|
|
250
|
+
async def remove_post_processing_manipulations(
    self, remove_from_text: Callable[[str], Awaitable[str]]
) -> list[LanguageModelMessage]:
    """Return the history with ``remove_from_text`` applied to each text message.

    The history is obtained via ``get_history()``. Messages whose content is
    a string have it replaced by the awaited result of ``remove_from_text``;
    any other content type is left untouched and a warning is logged.

    Args:
        remove_from_text: Async callable that receives a message text and
            returns the replacement text.

    Returns:
        The list returned by ``get_history()``, with string contents replaced.
    """
    history = await self.get_history()
    for entry in history:
        text = entry.content
        if not isinstance(text, str):
            self._logger.warning(
                f"Skipping message with unsupported content type: {type(text)}"
            )
            continue
        entry.content = await remove_from_text(text)
    return history
|