unique_toolkit 1.8.1-py3-none-any.whl → 1.23.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unique_toolkit might be problematic.
- unique_toolkit/__init__.py +20 -0
- unique_toolkit/_common/api_calling/human_verification_manager.py +121 -28
- unique_toolkit/_common/chunk_relevancy_sorter/config.py +3 -3
- unique_toolkit/_common/chunk_relevancy_sorter/tests/test_service.py +2 -5
- unique_toolkit/_common/default_language_model.py +9 -3
- unique_toolkit/_common/docx_generator/__init__.py +7 -0
- unique_toolkit/_common/docx_generator/config.py +12 -0
- unique_toolkit/_common/docx_generator/schemas.py +80 -0
- unique_toolkit/_common/docx_generator/service.py +252 -0
- unique_toolkit/_common/docx_generator/template/Doc Template.docx +0 -0
- unique_toolkit/_common/endpoint_builder.py +138 -117
- unique_toolkit/_common/endpoint_requestor.py +240 -14
- unique_toolkit/_common/exception.py +20 -0
- unique_toolkit/_common/feature_flags/schema.py +1 -5
- unique_toolkit/_common/referencing.py +53 -0
- unique_toolkit/_common/string_utilities.py +52 -1
- unique_toolkit/_common/tests/test_referencing.py +521 -0
- unique_toolkit/_common/tests/test_string_utilities.py +506 -0
- unique_toolkit/_common/utils/files.py +43 -0
- unique_toolkit/agentic/debug_info_manager/debug_info_manager.py +16 -6
- unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py +278 -0
- unique_toolkit/agentic/evaluation/config.py +3 -2
- unique_toolkit/agentic/evaluation/context_relevancy/service.py +2 -2
- unique_toolkit/agentic/evaluation/evaluation_manager.py +9 -5
- unique_toolkit/agentic/evaluation/hallucination/constants.py +1 -1
- unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py +26 -3
- unique_toolkit/agentic/history_manager/history_manager.py +14 -11
- unique_toolkit/agentic/history_manager/loop_token_reducer.py +3 -4
- unique_toolkit/agentic/history_manager/utils.py +10 -87
- unique_toolkit/agentic/postprocessor/postprocessor_manager.py +107 -16
- unique_toolkit/agentic/reference_manager/reference_manager.py +1 -1
- unique_toolkit/agentic/responses_api/__init__.py +19 -0
- unique_toolkit/agentic/responses_api/postprocessors/code_display.py +63 -0
- unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +145 -0
- unique_toolkit/agentic/responses_api/stream_handler.py +15 -0
- unique_toolkit/agentic/tools/a2a/__init__.py +18 -2
- unique_toolkit/agentic/tools/a2a/evaluation/__init__.py +2 -0
- unique_toolkit/agentic/tools/a2a/evaluation/_utils.py +3 -3
- unique_toolkit/agentic/tools/a2a/evaluation/config.py +1 -1
- unique_toolkit/agentic/tools/a2a/evaluation/evaluator.py +143 -91
- unique_toolkit/agentic/tools/a2a/manager.py +7 -1
- unique_toolkit/agentic/tools/a2a/postprocessing/__init__.py +11 -3
- unique_toolkit/agentic/tools/a2a/postprocessing/_display_utils.py +185 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/_ref_utils.py +73 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/config.py +21 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/display.py +180 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/references.py +101 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display_utils.py +1335 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_ref_utils.py +603 -0
- unique_toolkit/agentic/tools/a2a/prompts.py +46 -0
- unique_toolkit/agentic/tools/a2a/response_watcher/__init__.py +6 -0
- unique_toolkit/agentic/tools/a2a/response_watcher/service.py +91 -0
- unique_toolkit/agentic/tools/a2a/tool/config.py +15 -5
- unique_toolkit/agentic/tools/a2a/tool/service.py +69 -36
- unique_toolkit/agentic/tools/config.py +16 -2
- unique_toolkit/agentic/tools/factory.py +4 -0
- unique_toolkit/agentic/tools/mcp/tool_wrapper.py +7 -35
- unique_toolkit/agentic/tools/openai_builtin/__init__.py +11 -0
- unique_toolkit/agentic/tools/openai_builtin/base.py +30 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/__init__.py +8 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/config.py +57 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/service.py +230 -0
- unique_toolkit/agentic/tools/openai_builtin/manager.py +62 -0
- unique_toolkit/agentic/tools/test/test_mcp_manager.py +95 -7
- unique_toolkit/agentic/tools/test/test_tool_progress_reporter.py +240 -0
- unique_toolkit/agentic/tools/tool.py +0 -11
- unique_toolkit/agentic/tools/tool_manager.py +337 -122
- unique_toolkit/agentic/tools/tool_progress_reporter.py +81 -15
- unique_toolkit/agentic/tools/utils/__init__.py +18 -0
- unique_toolkit/agentic/tools/utils/execution/execution.py +8 -4
- unique_toolkit/agentic/tools/utils/source_handling/schema.py +1 -1
- unique_toolkit/chat/__init__.py +8 -1
- unique_toolkit/chat/deprecated/service.py +232 -0
- unique_toolkit/chat/functions.py +54 -40
- unique_toolkit/chat/rendering.py +34 -0
- unique_toolkit/chat/responses_api.py +461 -0
- unique_toolkit/chat/schemas.py +1 -1
- unique_toolkit/chat/service.py +96 -1569
- unique_toolkit/content/functions.py +116 -1
- unique_toolkit/content/schemas.py +59 -0
- unique_toolkit/content/service.py +5 -37
- unique_toolkit/content/smart_rules.py +301 -0
- unique_toolkit/framework_utilities/langchain/client.py +27 -3
- unique_toolkit/framework_utilities/openai/client.py +12 -1
- unique_toolkit/framework_utilities/openai/message_builder.py +85 -1
- unique_toolkit/language_model/default_language_model.py +3 -0
- unique_toolkit/language_model/functions.py +25 -9
- unique_toolkit/language_model/infos.py +72 -4
- unique_toolkit/language_model/schemas.py +246 -40
- unique_toolkit/protocols/support.py +91 -9
- unique_toolkit/services/__init__.py +7 -0
- unique_toolkit/services/chat_service.py +1630 -0
- unique_toolkit/services/knowledge_base.py +861 -0
- unique_toolkit/smart_rules/compile.py +56 -301
- unique_toolkit/test_utilities/events.py +197 -0
- {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/METADATA +173 -3
- {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/RECORD +99 -67
- unique_toolkit/agentic/tools/a2a/postprocessing/_display.py +0 -122
- unique_toolkit/agentic/tools/a2a/postprocessing/_utils.py +0 -19
- unique_toolkit/agentic/tools/a2a/postprocessing/postprocessor.py +0 -230
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_consolidate_references.py +0 -665
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display.py +0 -391
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_postprocessor_reference_functions.py +0 -256
- {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/WHEEL +0 -0
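The headline structural change is already visible in the listing: most of unique_toolkit/chat/service.py (about 1,500 lines) moves into the new unique_toolkit/services package (chat_service.py, knowledge_base.py), with an older copy kept under chat/deprecated/. A minimal import sketch of what that relocation might mean for callers, assuming the class is still named ChatService and that services/__init__.py re-exports it (neither is confirmed by the listing alone):

# Hypothetical sketch only: the new import path is inferred from the file
# listing above, not from documented API changes.

# Pre-1.23.0 location:
# from unique_toolkit.chat.service import ChatService

# 1.23.0 location, assuming services/__init__.py re-exports the class:
from unique_toolkit.services import ChatService  # noqa: F401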

unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py (new file)

@@ -0,0 +1,278 @@
+"""
+Test suite for DebugInfoManager class.
+
+This test suite validates the DebugInfoManager's ability to:
+1. Initialize with empty debug info
+2. Extract tool debug info from ToolCallResponse objects
+3. Handle loop iteration indices
+4. Add arbitrary key-value pairs to debug info
+5. Retrieve the complete debug info dictionary
+"""
+
+from unique_toolkit.agentic.debug_info_manager.debug_info_manager import (
+    DebugInfoManager,
+)
+from unique_toolkit.agentic.tools.schemas import ToolCallResponse
+
+
+class TestDebugInfoManager:
+    """Test suite for DebugInfoManager functionality."""
+
+    def test_init__initializes_empty_debug_info__on_creation(self):
+        """Test that DebugInfoManager initializes with empty tools list."""
+        manager = DebugInfoManager()
+
+        assert manager.debug_info == {"tools": []}
+        assert manager.get() == {"tools": []}
+
+    def test_extract_tool_debug_info__adds_single_tool__with_valid_response(self):
+        """Test extracting debug info from a single ToolCallResponse."""
+        manager = DebugInfoManager()
+        tool_call_response = ToolCallResponse(
+            id="tool_1",
+            name="TestTool",
+            debug_info={"execution_time": "100ms", "status": "success"},
+        )
+
+        manager.extract_tool_debug_info([tool_call_response])
+
+        debug_info = manager.get()
+        assert len(debug_info["tools"]) == 1
+        assert debug_info["tools"][0]["name"] == "TestTool"
+        assert debug_info["tools"][0]["info"]["execution_time"] == "100ms"
+        assert debug_info["tools"][0]["info"]["status"] == "success"
+
+    def test_extract_tool_debug_info__adds_multiple_tools__with_multiple_responses(
+        self,
+    ):
+        """Test extracting debug info from multiple ToolCallResponse objects."""
+        manager = DebugInfoManager()
+        tool_call_responses = [
+            ToolCallResponse(
+                id="tool_1",
+                name="SearchTool",
+                debug_info={"query": "test query", "results": 5},
+            ),
+            ToolCallResponse(
+                id="tool_2",
+                name="CalculatorTool",
+                debug_info={"operation": "add", "result": 42},
+            ),
+            ToolCallResponse(
+                id="tool_3",
+                name="WeatherTool",
+                debug_info={"location": "New York", "temperature": "72F"},
+            ),
+        ]
+
+        manager.extract_tool_debug_info(tool_call_responses)
+
+        debug_info = manager.get()
+        assert len(debug_info["tools"]) == 3
+        assert debug_info["tools"][0]["name"] == "SearchTool"
+        assert debug_info["tools"][1]["name"] == "CalculatorTool"
+        assert debug_info["tools"][2]["name"] == "WeatherTool"
+
+    def test_extract_tool_debug_info__preserves_order__with_sequential_calls(self):
+        """Test that multiple calls to extract_tool_debug_info preserve order."""
+        manager = DebugInfoManager()
+
+        # First call
+        manager.extract_tool_debug_info(
+            [ToolCallResponse(id="tool_1", name="Tool1", debug_info={"step": 1})]
+        )
+
+        # Second call
+        manager.extract_tool_debug_info(
+            [ToolCallResponse(id="tool_2", name="Tool2", debug_info={"step": 2})]
+        )
+
+        # Third call
+        manager.extract_tool_debug_info(
+            [ToolCallResponse(id="tool_3", name="Tool3", debug_info={"step": 3})]
+        )
+
+        debug_info = manager.get()
+        assert len(debug_info["tools"]) == 3
+        assert debug_info["tools"][0]["info"]["step"] == 1
+        assert debug_info["tools"][1]["info"]["step"] == 2
+        assert debug_info["tools"][2]["info"]["step"] == 3
+
+    def test_extract_tool_debug_info__adds_loop_iteration__when_index_provided(self):
+        """Test that loop_iteration_index is added to debug info when provided."""
+        manager = DebugInfoManager()
+        tool_call_response = ToolCallResponse(
+            id="tool_1", name="IterativeTool", debug_info={"status": "processing"}
+        )
+
+        manager.extract_tool_debug_info([tool_call_response], loop_iteration_index=3)
+
+        debug_info = manager.get()
+        assert debug_info["tools"][0]["info"]["loop_iteration"] == 3
+        assert debug_info["tools"][0]["info"]["status"] == "processing"
+
+    def test_extract_tool_debug_info__omits_loop_iteration__when_index_is_none(self):
+        """Test that loop_iteration is not added when index is None."""
+        manager = DebugInfoManager()
+        tool_call_response = ToolCallResponse(
+            id="tool_1", name="SingleRunTool", debug_info={"status": "complete"}
+        )
+
+        manager.extract_tool_debug_info([tool_call_response], loop_iteration_index=None)
+
+        debug_info = manager.get()
+        assert "loop_iteration" not in debug_info["tools"][0]["info"]
+        assert debug_info["tools"][0]["info"]["status"] == "complete"
+
+    def test_extract_tool_debug_info__handles_empty_debug_info__gracefully(self):
+        """Test extracting from ToolCallResponse with empty debug_info dict."""
+        manager = DebugInfoManager()
+        tool_call_response = ToolCallResponse(
+            id="tool_1", name="MinimalTool", debug_info={}
+        )
+
+        manager.extract_tool_debug_info([tool_call_response])
+
+        debug_info = manager.get()
+        assert len(debug_info["tools"]) == 1
+        assert debug_info["tools"][0]["name"] == "MinimalTool"
+        assert debug_info["tools"][0]["info"] == {}
+
+    def test_extract_tool_debug_info__handles_empty_list__without_error(self):
+        """Test that passing an empty list doesn't cause errors."""
+        manager = DebugInfoManager()
+
+        manager.extract_tool_debug_info([])
+
+        debug_info = manager.get()
+        assert debug_info["tools"] == []
+
+    def test_add__adds_new_key_value_pair__to_debug_info(self):
+        """Test adding a new key-value pair to debug_info."""
+        manager = DebugInfoManager()
+
+        manager.add("execution_summary", {"total_time": "500ms", "total_calls": 5})
+
+        debug_info = manager.get()
+        assert "execution_summary" in debug_info
+        assert debug_info["execution_summary"]["total_time"] == "500ms"
+        assert debug_info["execution_summary"]["total_calls"] == 5
+
+    def test_add__preserves_tools_list__when_adding_new_keys(self):
+        """Test that add() preserves the tools list."""
+        manager = DebugInfoManager()
+        manager.extract_tool_debug_info(
+            [
+                ToolCallResponse(
+                    id="tool_1", name="TestTool", debug_info={"test": "data"}
+                )
+            ]
+        )
+
+        manager.add("metadata", {"version": "1.0"})
+
+        debug_info = manager.get()
+        assert len(debug_info["tools"]) == 1
+        assert debug_info["tools"][0]["name"] == "TestTool"
+        assert debug_info["metadata"]["version"] == "1.0"
+
+    def test_add__overwrites_existing_key__when_key_exists(self):
+        """Test that add() overwrites an existing key."""
+        manager = DebugInfoManager()
+        manager.add("status", "in_progress")
+        manager.add("status", "completed")
+
+        debug_info = manager.get()
+        assert debug_info["status"] == "completed"
+
+    def test_add__adds_multiple_keys__with_sequential_calls(self):
+        """Test adding multiple key-value pairs with sequential calls."""
+        manager = DebugInfoManager()
+
+        manager.add("key1", "value1")
+        manager.add("key2", {"nested": "value2"})
+        manager.add("key3", [1, 2, 3])
+
+        debug_info = manager.get()
+        assert debug_info["key1"] == "value1"
+        assert debug_info["key2"]["nested"] == "value2"
+        assert debug_info["key3"] == [1, 2, 3]
+
+    def test_get__returns_complete_debug_info__with_mixed_data(self):
+        """Test get() returns complete debug info with tools and custom keys."""
+        manager = DebugInfoManager()
+
+        # Add tool debug info
+        manager.extract_tool_debug_info(
+            [ToolCallResponse(id="tool_1", name="Tool1", debug_info={"data": "test"})],
+            loop_iteration_index=0,
+        )
+
+        # Add custom keys
+        manager.add("start_time", "2025-10-16T10:00:00")
+        manager.add("end_time", "2025-10-16T10:01:00")
+
+        debug_info = manager.get()
+
+        assert "tools" in debug_info
+        assert "start_time" in debug_info
+        assert "end_time" in debug_info
+        assert len(debug_info["tools"]) == 1
+        assert debug_info["start_time"] == "2025-10-16T10:00:00"
+
+    def test_integration__complete_workflow__with_all_operations(self):
+        """Integration test: complete workflow using all DebugInfoManager methods."""
+        manager = DebugInfoManager()
+
+        # Initial state
+        assert manager.get() == {"tools": []}
+
+        # Add some metadata
+        manager.add("session_id", "abc-123")
+        manager.add("user_id", "user-456")
+
+        # First tool call (loop iteration 0)
+        manager.extract_tool_debug_info(
+            [
+                ToolCallResponse(
+                    id="tool_1",
+                    name="SearchTool",
+                    debug_info={"query": "AI research", "hits": 100},
+                )
+            ],
+            loop_iteration_index=0,
+        )
+
+        # Second tool call (loop iteration 1)
+        manager.extract_tool_debug_info(
+            [
+                ToolCallResponse(
+                    id="tool_2",
+                    name="AnalysisTool",
+                    debug_info={"processed": 50, "relevant": 10},
+                ),
+                ToolCallResponse(
+                    id="tool_3",
+                    name="SummaryTool",
+                    debug_info={"paragraphs": 3, "words": 250},
+                ),
+            ],
+            loop_iteration_index=1,
+        )
+
+        # Add final summary
+        manager.add("summary", {"total_tools": 3, "total_iterations": 2})
+
+        # Verify complete debug info
+        debug_info = manager.get()
+
+        assert debug_info["session_id"] == "abc-123"
+        assert debug_info["user_id"] == "user-456"
+        assert len(debug_info["tools"]) == 3
+        assert debug_info["tools"][0]["name"] == "SearchTool"
+        assert debug_info["tools"][0]["info"]["loop_iteration"] == 0
+        assert debug_info["tools"][1]["name"] == "AnalysisTool"
+        assert debug_info["tools"][1]["info"]["loop_iteration"] == 1
+        assert debug_info["tools"][2]["name"] == "SummaryTool"
+        assert debug_info["tools"][2]["info"]["loop_iteration"] == 1
+        assert debug_info["summary"]["total_tools"] == 3

unique_toolkit/agentic/evaluation/config.py

@@ -4,7 +4,8 @@ from humps import camelize
 from pydantic import BaseModel, ConfigDict, Field

 from unique_toolkit._common.validators import LMI
-from unique_toolkit.language_model.
+from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+from unique_toolkit.language_model.infos import LanguageModelInfo

 from .schemas import (
     EvaluationMetricName,
@@ -24,7 +25,7 @@ class EvaluationMetricConfig(BaseModel):
     enabled: bool = False
     name: EvaluationMetricName
     language_model: LMI = LanguageModelInfo.from_name(
-
+        DEFAULT_GPT_4o,
     )
     additional_llm_options: dict[str, Any] = Field(
         default={},

unique_toolkit/agentic/evaluation/context_relevancy/service.py

@@ -4,7 +4,6 @@ from typing import overload
 from pydantic import BaseModel, ValidationError
 from typing_extensions import deprecated

-from unique_toolkit._common.default_language_model import DEFAULT_GPT_35_TURBO
 from unique_toolkit._common.validate_required_values import (
     validate_required_values,
 )
@@ -24,6 +23,7 @@ from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricResult,
 )
 from unique_toolkit.app.schemas import BaseEvent, ChatEvent
+from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
 from unique_toolkit.language_model.infos import (
     LanguageModelInfo,
     ModelCapabilities,
@@ -49,7 +49,7 @@ USER_MSG_KEY = "userPrompt"
 default_config = EvaluationMetricConfig(
     enabled=False,
     name=EvaluationMetricName.CONTEXT_RELEVANCY,
-    language_model=LanguageModelInfo.from_name(
+    language_model=LanguageModelInfo.from_name(DEFAULT_GPT_4o),
     custom_prompts={
         SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
         USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,

unique_toolkit/agentic/evaluation/evaluation_manager.py

@@ -123,6 +123,15 @@ class EvaluationManager:
             self._evaluation_passed = False
             evaluation_results_unpacked.append(unpacked_evaluation_result)

+        for evaluation_name, evaluation_result in zip(
+            selected_evaluation_names, evaluation_results_unpacked
+        ):
+            evaluation_instance = self.get_evaluation_by_name(evaluation_name)
+            if evaluation_instance:
+                await self._show_message_assessment(
+                    evaluation_instance, evaluation_result, assistant_message_id
+                )
+
         return evaluation_results_unpacked

     async def execute_evaluation_call(
@@ -143,11 +152,6 @@
             evaluation_metric_result: EvaluationMetricResult = (
                 await evaluation_instance.run(loop_response)
             )
-            # show results to the user
-            await self._show_message_assessment(
-                evaluation_instance, evaluation_metric_result, assistant_message_id
-            )
-
             return evaluation_metric_result

         return EvaluationMetricResult(

unique_toolkit/agentic/evaluation/hallucination/constants.py

@@ -2,7 +2,6 @@ from typing import Any

 from pydantic import Field

-from unique_toolkit._common.default_language_model import DEFAULT_GPT_4o
 from unique_toolkit._common.validators import LMI
 from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
 from unique_toolkit.agentic.evaluation.hallucination.prompts import (
@@ -15,6 +14,7 @@ from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInputFieldName,
     EvaluationMetricName,
 )
+from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
 from unique_toolkit.language_model.infos import LanguageModelInfo

 SYSTEM_MSG_KEY = "systemPrompt"

unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py

@@ -1,3 +1,5 @@
+import regex as re
+
 from unique_toolkit.agentic.evaluation.evaluation_manager import Evaluation
 from unique_toolkit.agentic.evaluation.hallucination.constants import (
     HallucinationConfig,
@@ -18,6 +20,7 @@ from unique_toolkit.chat.schemas import (
     ChatMessageAssessmentStatus,
     ChatMessageAssessmentType,
 )
+from unique_toolkit.language_model.reference import _preprocess_message
 from unique_toolkit.language_model.schemas import (
     LanguageModelStreamResponse,
 )
@@ -40,13 +43,25 @@ class HallucinationEvaluation(Evaluation):
     async def run(
         self, loop_response: LanguageModelStreamResponse
     ) -> EvaluationMetricResult:  # type: ignore
-
+        all_chunks = self._reference_manager.get_chunks()
+
+        # source numbers from original text
+        ref_pattern = r"\[(\d+)\]"
+        original_text = loop_response.message.original_text
+
+        # preprocess original text to deal with different source patterns
+        original_text_preprocessed = _preprocess_message(original_text)
+
+        source_number_matches = re.findall(ref_pattern, original_text_preprocessed)
+        source_numbers = {int(num) for num in source_number_matches}
+
+        referenced_chunks = [all_chunks[idx] for idx in source_numbers]

         evaluation_result: EvaluationMetricResult = await check_hallucination(
             company_id=self._company_id,
             input=EvaluationMetricInput(
                 input_text=self._user_message,
-                context_texts=[context.text for context in
+                context_texts=[context.text for context in referenced_chunks],
                 history_messages=[],  # TODO include loop_history messages
                 output_text=loop_response.message.text,
             ),
@@ -78,11 +93,19 @@
             if not evaluation_result.error
             else ChatMessageAssessmentStatus.ERROR
         )
+        explanation = evaluation_result.reason
+
+        if status == ChatMessageAssessmentStatus.ERROR:
+            title = "Hallucination Check Error"
+            label = ChatMessageAssessmentLabel.RED
+            explanation = (
+                "An unrecoverable error occurred while evaluating the response."
+            )

         return EvaluationAssessmentMessage(
             status=status,
             title=title,
-            explanation=
+            explanation=explanation,
             label=label,
             type=self.get_assessment_type(),
         )
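The hallucination check now narrows its context: rather than passing every retrieved chunk, it collects the [n] source markers the model actually cited and forwards only those chunks to check_hallucination. The citation-extraction step in isolation, as a standalone sketch (the shipped code also runs _preprocess_message to normalise differing source-marker formats and uses the third-party regex module):

# Standalone sketch of the citation extraction added above; the stdlib re
# module is enough for this pattern.
import re

ref_pattern = r"\[(\d+)\]"
original_text = "Revenue grew 12% [1] while costs fell [3]; see also [1]."

source_numbers = {int(num) for num in re.findall(ref_pattern, original_text)}
print(source_numbers)  # {1, 3} -- duplicates collapse into a set

# The evaluation then keeps only the cited chunks:
# referenced_chunks = [all_chunks[idx] for idx in source_numbers]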

unique_toolkit/agentic/history_manager/history_manager.py

@@ -3,6 +3,9 @@ from typing import Annotated, Awaitable, Callable

 from pydantic import BaseModel, Field

+from unique_toolkit._common.feature_flags.schema import (
+    FeatureExtendedSourceSerialization,
+)
 from unique_toolkit._common.validators import LMI
 from unique_toolkit.agentic.history_manager.loop_token_reducer import LoopTokenReducer
 from unique_toolkit.agentic.history_manager.utils import transform_chunks_to_string
@@ -10,7 +13,8 @@ from unique_toolkit.agentic.reference_manager.reference_manager import Reference
 from unique_toolkit.agentic.tools.config import get_configuration_dict
 from unique_toolkit.agentic.tools.schemas import ToolCallResponse
 from unique_toolkit.app.schemas import ChatEvent
-from unique_toolkit.language_model.
+from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+from unique_toolkit.language_model.infos import LanguageModelInfo
 from unique_toolkit.language_model.schemas import (
     LanguageModelAssistantMessage,
     LanguageModelFunction,
@@ -41,11 +45,7 @@ class UploadedContentConfig(BaseModel):
     )


-class ExperimentalFeatures(
-    full_sources_serialize_dump: bool = Field(
-        default=False,
-        description="If True, the sources will be serialized in full, otherwise only the content will be serialized.",
-    )
+class ExperimentalFeatures(FeatureExtendedSourceSerialization): ...


 class HistoryManagerConfig(BaseModel):
@@ -61,9 +61,7 @@ class HistoryManagerConfig(BaseModel):
         description="The fraction of the max input tokens that will be reserved for the history.",
     )

-    language_model: LMI = LanguageModelInfo.from_name(
-        LanguageModelName.AZURE_GPT_4o_2024_1120
-    )
+    language_model: LMI = LanguageModelInfo.from_name(DEFAULT_GPT_4o)

     @property
     def max_history_tokens(self) -> int:
@@ -122,9 +120,16 @@ class HistoryManager:
             reference_manager=reference_manager,
         )
         self._tool_call_result_history: list[ToolCallResponse] = []
+        self._tool_calls: list[LanguageModelFunction] = []
         self._loop_history: list[LanguageModelMessage] = []
         self._source_enumerator = 0

+    def add_tool_call(self, tool_call: LanguageModelFunction) -> None:
+        self._tool_calls.append(tool_call)
+
+    def get_tool_calls(self) -> list[LanguageModelFunction]:
+        return self._tool_calls
+
     def has_no_loop_messages(self) -> bool:
         return len(self._loop_history) == 0

@@ -173,8 +178,6 @@
         stringified_sources, sources = transform_chunks_to_string(
             content_chunks,
             self._source_enumerator,
-            None,  # Use None for SourceFormatConfig
-            self._config.experimental_features.full_sources_serialize_dump,
         )

         self._source_enumerator += len(

unique_toolkit/agentic/history_manager/loop_token_reducer.py

@@ -266,7 +266,9 @@ class LoopTokenReducer:
         selected_messages = []
         token_count = 0
         for msg in messages[::-1]:
-            msg_token_count = self.
+            msg_token_count = self._count_message_tokens(
+                LanguageModelMessages(root=[msg])
+            )
             if token_count + msg_token_count > token_limit:
                 break
             selected_messages.append(msg)
@@ -293,9 +295,6 @@
         )
         return messages

-    def _count_tokens(self, text: str) -> int:
-        return len(self._encoder.encode(text))
-
     def ensure_last_message_is_user_message(self, limited_history_messages):
         """
         As the token limit can be reached in the middle of a gpt_request,
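The reducer now routes every history message through the same message-level token counter used elsewhere in the toolkit (wrapping the single message in a LanguageModelMessages container) and drops the raw-text _count_tokens helper. The newest-first selection loop this feeds, as a self-contained sketch with a stand-in counter (the real counter and message types come from unique_toolkit):

# Self-contained sketch of the newest-first selection loop shown in the hunk
# above; count_tokens here is a crude stand-in for the toolkit's message counter.
def count_tokens(message: str) -> int:
    return len(message.split())  # stand-in for a real tokenizer


def select_recent_messages(messages: list[str], token_limit: int) -> list[str]:
    selected: list[str] = []
    token_count = 0
    for msg in messages[::-1]:  # walk history newest-first
        msg_token_count = count_tokens(msg)
        if token_count + msg_token_count > token_limit:
            break
        selected.append(msg)
        token_count += msg_token_count
    return selected[::-1]  # restore chronological order


print(select_recent_messages(["a b c", "d e", "f g h i"], token_limit=6))
# ['d e', 'f g h i']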
@@ -1,12 +1,10 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
from copy import deepcopy
|
|
4
|
+
from typing import Any
|
|
4
5
|
|
|
5
6
|
from unique_toolkit.agentic.tools.schemas import Source
|
|
6
|
-
from unique_toolkit.
|
|
7
|
-
SourceFormatConfig,
|
|
8
|
-
)
|
|
9
|
-
from unique_toolkit.content.schemas import ContentChunk, ContentMetadata
|
|
7
|
+
from unique_toolkit.content.schemas import ContentChunk
|
|
10
8
|
from unique_toolkit.language_model.schemas import (
|
|
11
9
|
LanguageModelAssistantMessage,
|
|
12
10
|
LanguageModelMessage,
|
|
@@ -62,58 +60,25 @@ def _convert_tool_call_response_to_content(
|
|
|
62
60
|
def transform_chunks_to_string(
|
|
63
61
|
content_chunks: list[ContentChunk],
|
|
64
62
|
max_source_number: int,
|
|
65
|
-
|
|
66
|
-
full_sources_serialize_dump: bool = False,
|
|
67
|
-
) -> tuple[str, list[Source]]:
|
|
63
|
+
) -> tuple[str, list[dict[str, Any]]]:
|
|
68
64
|
"""Transform content chunks into a string of sources.
|
|
69
65
|
|
|
70
66
|
Args:
|
|
71
67
|
content_chunks (list[ContentChunk]): The content chunks to transform
|
|
72
68
|
max_source_number (int): The maximum source number to use
|
|
73
|
-
feature_full_sources (bool, optional): Whether to include the full source object. Defaults to False which is the old format.
|
|
74
69
|
|
|
75
70
|
Returns:
|
|
76
71
|
str: String for the tool call response
|
|
77
72
|
"""
|
|
78
73
|
if not content_chunks:
|
|
79
74
|
return "No relevant sources found.", []
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
content=chunk.text,
|
|
88
|
-
chunk_id=chunk.chunk_id,
|
|
89
|
-
metadata=(
|
|
90
|
-
_format_metadata(chunk.metadata, cfg) or None
|
|
91
|
-
if chunk.metadata
|
|
92
|
-
else None
|
|
93
|
-
),
|
|
94
|
-
url=chunk.url,
|
|
95
|
-
).model_dump(
|
|
96
|
-
exclude_none=True,
|
|
97
|
-
exclude_defaults=True,
|
|
98
|
-
by_alias=True,
|
|
99
|
-
)
|
|
100
|
-
for i, chunk in enumerate(content_chunks)
|
|
101
|
-
]
|
|
102
|
-
else:
|
|
103
|
-
sources = [
|
|
104
|
-
{
|
|
105
|
-
"source_number": max_source_number + i,
|
|
106
|
-
"content": chunk.text,
|
|
107
|
-
**(
|
|
108
|
-
{"metadata": meta}
|
|
109
|
-
if (
|
|
110
|
-
meta := _format_metadata(chunk.metadata, cfg)
|
|
111
|
-
) # only add when not empty
|
|
112
|
-
else {}
|
|
113
|
-
),
|
|
114
|
-
}
|
|
115
|
-
for i, chunk in enumerate(content_chunks)
|
|
116
|
-
]
|
|
75
|
+
sources: list[dict[str, Any]] = [
|
|
76
|
+
{
|
|
77
|
+
"source_number": max_source_number + i,
|
|
78
|
+
"content": chunk.text,
|
|
79
|
+
}
|
|
80
|
+
for i, chunk in enumerate(content_chunks)
|
|
81
|
+
]
|
|
117
82
|
return json.dumps(sources), sources
|
|
118
83
|
|
|
119
84
|
|
|
@@ -129,45 +94,3 @@ def load_sources_from_string(
|
|
|
129
94
|
except (json.JSONDecodeError, ValueError):
|
|
130
95
|
logger.warning("Failed to parse source string")
|
|
131
96
|
return None
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
def _format_metadata(
|
|
135
|
-
metadata: ContentMetadata | None,
|
|
136
|
-
cfg: SourceFormatConfig | None,
|
|
137
|
-
) -> str:
|
|
138
|
-
"""
|
|
139
|
-
Build the concatenated tag string from the chunk's metadata
|
|
140
|
-
and the templates found in cfg.sections.
|
|
141
|
-
Example result:
|
|
142
|
-
"<|topic|>GenAI<|/topic|>\n<|date|>This document is from: 2025-04-29<|/date|>\n"
|
|
143
|
-
"""
|
|
144
|
-
if metadata is None:
|
|
145
|
-
return ""
|
|
146
|
-
|
|
147
|
-
if cfg is None or not cfg.sections:
|
|
148
|
-
# If no configuration is provided, return empty string
|
|
149
|
-
return ""
|
|
150
|
-
|
|
151
|
-
meta_dict = metadata.model_dump(exclude_none=True, by_alias=True)
|
|
152
|
-
sections = cfg.sections
|
|
153
|
-
|
|
154
|
-
parts: list[str] = []
|
|
155
|
-
for key, template in sections.items():
|
|
156
|
-
if key in meta_dict:
|
|
157
|
-
parts.append(template.format(meta_dict[key]))
|
|
158
|
-
|
|
159
|
-
return "".join(parts)
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
### In case we do not want any formatting of metadata we could use this function
|
|
163
|
-
# def _filtered_metadata(
|
|
164
|
-
# meta: ContentMetadata | None,
|
|
165
|
-
# cfg: SourceFormatConfig,
|
|
166
|
-
# ) -> dict[str, str] | None:
|
|
167
|
-
# if meta is None:
|
|
168
|
-
# return None
|
|
169
|
-
|
|
170
|
-
# allowed = set(cfg.sections)
|
|
171
|
-
# raw = meta.model_dump(exclude_none=True, by_alias=True)
|
|
172
|
-
# pruned = {k: v for k, v in raw.items() if k in allowed}
|
|
173
|
-
# return pruned or None
|