unique_toolkit 1.8.1__py3-none-any.whl → 1.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (105)
  1. unique_toolkit/__init__.py +20 -0
  2. unique_toolkit/_common/api_calling/human_verification_manager.py +121 -28
  3. unique_toolkit/_common/chunk_relevancy_sorter/config.py +3 -3
  4. unique_toolkit/_common/chunk_relevancy_sorter/tests/test_service.py +2 -5
  5. unique_toolkit/_common/default_language_model.py +9 -3
  6. unique_toolkit/_common/docx_generator/__init__.py +7 -0
  7. unique_toolkit/_common/docx_generator/config.py +12 -0
  8. unique_toolkit/_common/docx_generator/schemas.py +80 -0
  9. unique_toolkit/_common/docx_generator/service.py +252 -0
  10. unique_toolkit/_common/docx_generator/template/Doc Template.docx +0 -0
  11. unique_toolkit/_common/endpoint_builder.py +138 -117
  12. unique_toolkit/_common/endpoint_requestor.py +240 -14
  13. unique_toolkit/_common/exception.py +20 -0
  14. unique_toolkit/_common/feature_flags/schema.py +1 -5
  15. unique_toolkit/_common/referencing.py +53 -0
  16. unique_toolkit/_common/string_utilities.py +52 -1
  17. unique_toolkit/_common/tests/test_referencing.py +521 -0
  18. unique_toolkit/_common/tests/test_string_utilities.py +506 -0
  19. unique_toolkit/_common/utils/files.py +43 -0
  20. unique_toolkit/agentic/debug_info_manager/debug_info_manager.py +16 -6
  21. unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py +278 -0
  22. unique_toolkit/agentic/evaluation/config.py +3 -2
  23. unique_toolkit/agentic/evaluation/context_relevancy/service.py +2 -2
  24. unique_toolkit/agentic/evaluation/evaluation_manager.py +9 -5
  25. unique_toolkit/agentic/evaluation/hallucination/constants.py +1 -1
  26. unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py +26 -3
  27. unique_toolkit/agentic/history_manager/history_manager.py +14 -11
  28. unique_toolkit/agentic/history_manager/loop_token_reducer.py +3 -4
  29. unique_toolkit/agentic/history_manager/utils.py +10 -87
  30. unique_toolkit/agentic/postprocessor/postprocessor_manager.py +107 -16
  31. unique_toolkit/agentic/reference_manager/reference_manager.py +1 -1
  32. unique_toolkit/agentic/responses_api/__init__.py +19 -0
  33. unique_toolkit/agentic/responses_api/postprocessors/code_display.py +63 -0
  34. unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +145 -0
  35. unique_toolkit/agentic/responses_api/stream_handler.py +15 -0
  36. unique_toolkit/agentic/tools/a2a/__init__.py +18 -2
  37. unique_toolkit/agentic/tools/a2a/evaluation/__init__.py +2 -0
  38. unique_toolkit/agentic/tools/a2a/evaluation/_utils.py +3 -3
  39. unique_toolkit/agentic/tools/a2a/evaluation/config.py +1 -1
  40. unique_toolkit/agentic/tools/a2a/evaluation/evaluator.py +143 -91
  41. unique_toolkit/agentic/tools/a2a/manager.py +7 -1
  42. unique_toolkit/agentic/tools/a2a/postprocessing/__init__.py +11 -3
  43. unique_toolkit/agentic/tools/a2a/postprocessing/_display_utils.py +185 -0
  44. unique_toolkit/agentic/tools/a2a/postprocessing/_ref_utils.py +73 -0
  45. unique_toolkit/agentic/tools/a2a/postprocessing/config.py +21 -0
  46. unique_toolkit/agentic/tools/a2a/postprocessing/display.py +180 -0
  47. unique_toolkit/agentic/tools/a2a/postprocessing/references.py +101 -0
  48. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display_utils.py +1335 -0
  49. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_ref_utils.py +603 -0
  50. unique_toolkit/agentic/tools/a2a/prompts.py +46 -0
  51. unique_toolkit/agentic/tools/a2a/response_watcher/__init__.py +6 -0
  52. unique_toolkit/agentic/tools/a2a/response_watcher/service.py +91 -0
  53. unique_toolkit/agentic/tools/a2a/tool/config.py +15 -5
  54. unique_toolkit/agentic/tools/a2a/tool/service.py +69 -36
  55. unique_toolkit/agentic/tools/config.py +16 -2
  56. unique_toolkit/agentic/tools/factory.py +4 -0
  57. unique_toolkit/agentic/tools/mcp/tool_wrapper.py +7 -35
  58. unique_toolkit/agentic/tools/openai_builtin/__init__.py +11 -0
  59. unique_toolkit/agentic/tools/openai_builtin/base.py +30 -0
  60. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/__init__.py +8 -0
  61. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/config.py +57 -0
  62. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/service.py +230 -0
  63. unique_toolkit/agentic/tools/openai_builtin/manager.py +62 -0
  64. unique_toolkit/agentic/tools/test/test_mcp_manager.py +95 -7
  65. unique_toolkit/agentic/tools/test/test_tool_progress_reporter.py +240 -0
  66. unique_toolkit/agentic/tools/tool.py +0 -11
  67. unique_toolkit/agentic/tools/tool_manager.py +337 -122
  68. unique_toolkit/agentic/tools/tool_progress_reporter.py +81 -15
  69. unique_toolkit/agentic/tools/utils/__init__.py +18 -0
  70. unique_toolkit/agentic/tools/utils/execution/execution.py +8 -4
  71. unique_toolkit/agentic/tools/utils/source_handling/schema.py +1 -1
  72. unique_toolkit/chat/__init__.py +8 -1
  73. unique_toolkit/chat/deprecated/service.py +232 -0
  74. unique_toolkit/chat/functions.py +54 -40
  75. unique_toolkit/chat/rendering.py +34 -0
  76. unique_toolkit/chat/responses_api.py +461 -0
  77. unique_toolkit/chat/schemas.py +1 -1
  78. unique_toolkit/chat/service.py +96 -1569
  79. unique_toolkit/content/functions.py +116 -1
  80. unique_toolkit/content/schemas.py +59 -0
  81. unique_toolkit/content/service.py +5 -37
  82. unique_toolkit/content/smart_rules.py +301 -0
  83. unique_toolkit/framework_utilities/langchain/client.py +27 -3
  84. unique_toolkit/framework_utilities/openai/client.py +12 -1
  85. unique_toolkit/framework_utilities/openai/message_builder.py +85 -1
  86. unique_toolkit/language_model/default_language_model.py +3 -0
  87. unique_toolkit/language_model/functions.py +25 -9
  88. unique_toolkit/language_model/infos.py +72 -4
  89. unique_toolkit/language_model/schemas.py +246 -40
  90. unique_toolkit/protocols/support.py +91 -9
  91. unique_toolkit/services/__init__.py +7 -0
  92. unique_toolkit/services/chat_service.py +1630 -0
  93. unique_toolkit/services/knowledge_base.py +861 -0
  94. unique_toolkit/smart_rules/compile.py +56 -301
  95. unique_toolkit/test_utilities/events.py +197 -0
  96. {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/METADATA +173 -3
  97. {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/RECORD +99 -67
  98. unique_toolkit/agentic/tools/a2a/postprocessing/_display.py +0 -122
  99. unique_toolkit/agentic/tools/a2a/postprocessing/_utils.py +0 -19
  100. unique_toolkit/agentic/tools/a2a/postprocessing/postprocessor.py +0 -230
  101. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_consolidate_references.py +0 -665
  102. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display.py +0 -391
  103. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_postprocessor_reference_functions.py +0 -256
  104. {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/LICENSE +0 -0
  105. {unique_toolkit-1.8.1.dist-info → unique_toolkit-1.23.0.dist-info}/WHEEL +0 -0
unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py
@@ -0,0 +1,278 @@
+ """
+ Test suite for DebugInfoManager class.
+
+ This test suite validates the DebugInfoManager's ability to:
+ 1. Initialize with empty debug info
+ 2. Extract tool debug info from ToolCallResponse objects
+ 3. Handle loop iteration indices
+ 4. Add arbitrary key-value pairs to debug info
+ 5. Retrieve the complete debug info dictionary
+ """
+
+ from unique_toolkit.agentic.debug_info_manager.debug_info_manager import (
+     DebugInfoManager,
+ )
+ from unique_toolkit.agentic.tools.schemas import ToolCallResponse
+
+
+ class TestDebugInfoManager:
+     """Test suite for DebugInfoManager functionality."""
+
+     def test_init__initializes_empty_debug_info__on_creation(self):
+         """Test that DebugInfoManager initializes with empty tools list."""
+         manager = DebugInfoManager()
+
+         assert manager.debug_info == {"tools": []}
+         assert manager.get() == {"tools": []}
+
+     def test_extract_tool_debug_info__adds_single_tool__with_valid_response(self):
+         """Test extracting debug info from a single ToolCallResponse."""
+         manager = DebugInfoManager()
+         tool_call_response = ToolCallResponse(
+             id="tool_1",
+             name="TestTool",
+             debug_info={"execution_time": "100ms", "status": "success"},
+         )
+
+         manager.extract_tool_debug_info([tool_call_response])
+
+         debug_info = manager.get()
+         assert len(debug_info["tools"]) == 1
+         assert debug_info["tools"][0]["name"] == "TestTool"
+         assert debug_info["tools"][0]["info"]["execution_time"] == "100ms"
+         assert debug_info["tools"][0]["info"]["status"] == "success"
+
+     def test_extract_tool_debug_info__adds_multiple_tools__with_multiple_responses(
+         self,
+     ):
+         """Test extracting debug info from multiple ToolCallResponse objects."""
+         manager = DebugInfoManager()
+         tool_call_responses = [
+             ToolCallResponse(
+                 id="tool_1",
+                 name="SearchTool",
+                 debug_info={"query": "test query", "results": 5},
+             ),
+             ToolCallResponse(
+                 id="tool_2",
+                 name="CalculatorTool",
+                 debug_info={"operation": "add", "result": 42},
+             ),
+             ToolCallResponse(
+                 id="tool_3",
+                 name="WeatherTool",
+                 debug_info={"location": "New York", "temperature": "72F"},
+             ),
+         ]
+
+         manager.extract_tool_debug_info(tool_call_responses)
+
+         debug_info = manager.get()
+         assert len(debug_info["tools"]) == 3
+         assert debug_info["tools"][0]["name"] == "SearchTool"
+         assert debug_info["tools"][1]["name"] == "CalculatorTool"
+         assert debug_info["tools"][2]["name"] == "WeatherTool"
+
+     def test_extract_tool_debug_info__preserves_order__with_sequential_calls(self):
+         """Test that multiple calls to extract_tool_debug_info preserve order."""
+         manager = DebugInfoManager()
+
+         # First call
+         manager.extract_tool_debug_info(
+             [ToolCallResponse(id="tool_1", name="Tool1", debug_info={"step": 1})]
+         )
+
+         # Second call
+         manager.extract_tool_debug_info(
+             [ToolCallResponse(id="tool_2", name="Tool2", debug_info={"step": 2})]
+         )
+
+         # Third call
+         manager.extract_tool_debug_info(
+             [ToolCallResponse(id="tool_3", name="Tool3", debug_info={"step": 3})]
+         )
+
+         debug_info = manager.get()
+         assert len(debug_info["tools"]) == 3
+         assert debug_info["tools"][0]["info"]["step"] == 1
+         assert debug_info["tools"][1]["info"]["step"] == 2
+         assert debug_info["tools"][2]["info"]["step"] == 3
+
+     def test_extract_tool_debug_info__adds_loop_iteration__when_index_provided(self):
+         """Test that loop_iteration_index is added to debug info when provided."""
+         manager = DebugInfoManager()
+         tool_call_response = ToolCallResponse(
+             id="tool_1", name="IterativeTool", debug_info={"status": "processing"}
+         )
+
+         manager.extract_tool_debug_info([tool_call_response], loop_iteration_index=3)
+
+         debug_info = manager.get()
+         assert debug_info["tools"][0]["info"]["loop_iteration"] == 3
+         assert debug_info["tools"][0]["info"]["status"] == "processing"
+
+     def test_extract_tool_debug_info__omits_loop_iteration__when_index_is_none(self):
+         """Test that loop_iteration is not added when index is None."""
+         manager = DebugInfoManager()
+         tool_call_response = ToolCallResponse(
+             id="tool_1", name="SingleRunTool", debug_info={"status": "complete"}
+         )
+
+         manager.extract_tool_debug_info([tool_call_response], loop_iteration_index=None)
+
+         debug_info = manager.get()
+         assert "loop_iteration" not in debug_info["tools"][0]["info"]
+         assert debug_info["tools"][0]["info"]["status"] == "complete"
+
+     def test_extract_tool_debug_info__handles_empty_debug_info__gracefully(self):
+         """Test extracting from ToolCallResponse with empty debug_info dict."""
+         manager = DebugInfoManager()
+         tool_call_response = ToolCallResponse(
+             id="tool_1", name="MinimalTool", debug_info={}
+         )
+
+         manager.extract_tool_debug_info([tool_call_response])
+
+         debug_info = manager.get()
+         assert len(debug_info["tools"]) == 1
+         assert debug_info["tools"][0]["name"] == "MinimalTool"
+         assert debug_info["tools"][0]["info"] == {}
+
+     def test_extract_tool_debug_info__handles_empty_list__without_error(self):
+         """Test that passing an empty list doesn't cause errors."""
+         manager = DebugInfoManager()
+
+         manager.extract_tool_debug_info([])
+
+         debug_info = manager.get()
+         assert debug_info["tools"] == []
+
+     def test_add__adds_new_key_value_pair__to_debug_info(self):
+         """Test adding a new key-value pair to debug_info."""
+         manager = DebugInfoManager()
+
+         manager.add("execution_summary", {"total_time": "500ms", "total_calls": 5})
+
+         debug_info = manager.get()
+         assert "execution_summary" in debug_info
+         assert debug_info["execution_summary"]["total_time"] == "500ms"
+         assert debug_info["execution_summary"]["total_calls"] == 5
+
+     def test_add__preserves_tools_list__when_adding_new_keys(self):
+         """Test that add() preserves the tools list."""
+         manager = DebugInfoManager()
+         manager.extract_tool_debug_info(
+             [
+                 ToolCallResponse(
+                     id="tool_1", name="TestTool", debug_info={"test": "data"}
+                 )
+             ]
+         )
+
+         manager.add("metadata", {"version": "1.0"})
+
+         debug_info = manager.get()
+         assert len(debug_info["tools"]) == 1
+         assert debug_info["tools"][0]["name"] == "TestTool"
+         assert debug_info["metadata"]["version"] == "1.0"
+
+     def test_add__overwrites_existing_key__when_key_exists(self):
+         """Test that add() overwrites an existing key."""
+         manager = DebugInfoManager()
+         manager.add("status", "in_progress")
+         manager.add("status", "completed")
+
+         debug_info = manager.get()
+         assert debug_info["status"] == "completed"
+
+     def test_add__adds_multiple_keys__with_sequential_calls(self):
+         """Test adding multiple key-value pairs with sequential calls."""
+         manager = DebugInfoManager()
+
+         manager.add("key1", "value1")
+         manager.add("key2", {"nested": "value2"})
+         manager.add("key3", [1, 2, 3])
+
+         debug_info = manager.get()
+         assert debug_info["key1"] == "value1"
+         assert debug_info["key2"]["nested"] == "value2"
+         assert debug_info["key3"] == [1, 2, 3]
+
+     def test_get__returns_complete_debug_info__with_mixed_data(self):
+         """Test get() returns complete debug info with tools and custom keys."""
+         manager = DebugInfoManager()
+
+         # Add tool debug info
+         manager.extract_tool_debug_info(
+             [ToolCallResponse(id="tool_1", name="Tool1", debug_info={"data": "test"})],
+             loop_iteration_index=0,
+         )
+
+         # Add custom keys
+         manager.add("start_time", "2025-10-16T10:00:00")
+         manager.add("end_time", "2025-10-16T10:01:00")
+
+         debug_info = manager.get()
+
+         assert "tools" in debug_info
+         assert "start_time" in debug_info
+         assert "end_time" in debug_info
+         assert len(debug_info["tools"]) == 1
+         assert debug_info["start_time"] == "2025-10-16T10:00:00"
+
+     def test_integration__complete_workflow__with_all_operations(self):
+         """Integration test: complete workflow using all DebugInfoManager methods."""
+         manager = DebugInfoManager()
+
+         # Initial state
+         assert manager.get() == {"tools": []}
+
+         # Add some metadata
+         manager.add("session_id", "abc-123")
+         manager.add("user_id", "user-456")
+
+         # First tool call (loop iteration 0)
+         manager.extract_tool_debug_info(
+             [
+                 ToolCallResponse(
+                     id="tool_1",
+                     name="SearchTool",
+                     debug_info={"query": "AI research", "hits": 100},
+                 )
+             ],
+             loop_iteration_index=0,
+         )
+
+         # Second tool call (loop iteration 1)
+         manager.extract_tool_debug_info(
+             [
+                 ToolCallResponse(
+                     id="tool_2",
+                     name="AnalysisTool",
+                     debug_info={"processed": 50, "relevant": 10},
+                 ),
+                 ToolCallResponse(
+                     id="tool_3",
+                     name="SummaryTool",
+                     debug_info={"paragraphs": 3, "words": 250},
+                 ),
+             ],
+             loop_iteration_index=1,
+         )
+
+         # Add final summary
+         manager.add("summary", {"total_tools": 3, "total_iterations": 2})
+
+         # Verify complete debug info
+         debug_info = manager.get()
+
+         assert debug_info["session_id"] == "abc-123"
+         assert debug_info["user_id"] == "user-456"
+         assert len(debug_info["tools"]) == 3
+         assert debug_info["tools"][0]["name"] == "SearchTool"
+         assert debug_info["tools"][0]["info"]["loop_iteration"] == 0
+         assert debug_info["tools"][1]["name"] == "AnalysisTool"
+         assert debug_info["tools"][1]["info"]["loop_iteration"] == 1
+         assert debug_info["tools"][2]["name"] == "SummaryTool"
+         assert debug_info["tools"][2]["info"]["loop_iteration"] == 1
+         assert debug_info["summary"]["total_tools"] == 3
unique_toolkit/agentic/evaluation/config.py
@@ -4,7 +4,8 @@ from humps import camelize
  from pydantic import BaseModel, ConfigDict, Field

  from unique_toolkit._common.validators import LMI
- from unique_toolkit.language_model.infos import LanguageModelInfo, LanguageModelName
+ from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+ from unique_toolkit.language_model.infos import LanguageModelInfo

  from .schemas import (
      EvaluationMetricName,
@@ -24,7 +25,7 @@ class EvaluationMetricConfig(BaseModel):
      enabled: bool = False
      name: EvaluationMetricName
      language_model: LMI = LanguageModelInfo.from_name(
-         LanguageModelName.AZURE_GPT_35_TURBO_0125,
+         DEFAULT_GPT_4o,
      )
      additional_llm_options: dict[str, Any] = Field(
          default={},
unique_toolkit/agentic/evaluation/context_relevancy/service.py
@@ -4,7 +4,6 @@ from typing import overload
  from pydantic import BaseModel, ValidationError
  from typing_extensions import deprecated

- from unique_toolkit._common.default_language_model import DEFAULT_GPT_35_TURBO
  from unique_toolkit._common.validate_required_values import (
      validate_required_values,
  )
@@ -24,6 +23,7 @@ from unique_toolkit.agentic.evaluation.schemas import (
      EvaluationMetricResult,
  )
  from unique_toolkit.app.schemas import BaseEvent, ChatEvent
+ from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
  from unique_toolkit.language_model.infos import (
      LanguageModelInfo,
      ModelCapabilities,
@@ -49,7 +49,7 @@ USER_MSG_KEY = "userPrompt"
  default_config = EvaluationMetricConfig(
      enabled=False,
      name=EvaluationMetricName.CONTEXT_RELEVANCY,
-     language_model=LanguageModelInfo.from_name(DEFAULT_GPT_35_TURBO),
+     language_model=LanguageModelInfo.from_name(DEFAULT_GPT_4o),
      custom_prompts={
          SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
          USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
unique_toolkit/agentic/evaluation/evaluation_manager.py
@@ -123,6 +123,15 @@ class EvaluationManager:
                  self._evaluation_passed = False
              evaluation_results_unpacked.append(unpacked_evaluation_result)

+         for evaluation_name, evaluation_result in zip(
+             selected_evaluation_names, evaluation_results_unpacked
+         ):
+             evaluation_instance = self.get_evaluation_by_name(evaluation_name)
+             if evaluation_instance:
+                 await self._show_message_assessment(
+                     evaluation_instance, evaluation_result, assistant_message_id
+                 )
+
          return evaluation_results_unpacked

      async def execute_evaluation_call(
@@ -143,11 +152,6 @@
              evaluation_metric_result: EvaluationMetricResult = (
                  await evaluation_instance.run(loop_response)
              )
-             # show results to the user
-             await self._show_message_assessment(
-                 evaluation_instance, evaluation_metric_result, assistant_message_id
-             )
-
              return evaluation_metric_result

          return EvaluationMetricResult(
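
Net effect of the two hunks above: execute_evaluation_call no longer reports its own assessment; all evaluations run first, then each result is shown exactly once by pairing names with results. A sketch of the reordered flow, assuming a hypothetical wrapper and argument list (only get_evaluation_by_name, _show_message_assessment, and the zip pairing come from the diff):

    # hypothetical caller, for illustration only
    async def run_evaluations(self, selected_evaluation_names, loop_response, assistant_message_id):
        results = [
            await self.execute_evaluation_call(name, loop_response, assistant_message_id)
            for name in selected_evaluation_names
        ]  # run everything first, without reporting

        # then report once, pairing each evaluation with its result
        for name, result in zip(selected_evaluation_names, results):
            instance = self.get_evaluation_by_name(name)
            if instance:
                await self._show_message_assessment(instance, result, assistant_message_id)
        return results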
unique_toolkit/agentic/evaluation/hallucination/constants.py
@@ -2,7 +2,6 @@ from typing import Any

  from pydantic import Field

- from unique_toolkit._common.default_language_model import DEFAULT_GPT_4o
  from unique_toolkit._common.validators import LMI
  from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
  from unique_toolkit.agentic.evaluation.hallucination.prompts import (
@@ -15,6 +14,7 @@ from unique_toolkit.agentic.evaluation.schemas import (
      EvaluationMetricInputFieldName,
      EvaluationMetricName,
  )
+ from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
  from unique_toolkit.language_model.infos import LanguageModelInfo

  SYSTEM_MSG_KEY = "systemPrompt"
unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py
@@ -1,3 +1,5 @@
+ import regex as re
+
  from unique_toolkit.agentic.evaluation.evaluation_manager import Evaluation
  from unique_toolkit.agentic.evaluation.hallucination.constants import (
      HallucinationConfig,
@@ -18,6 +20,7 @@ from unique_toolkit.chat.schemas import (
      ChatMessageAssessmentStatus,
      ChatMessageAssessmentType,
  )
+ from unique_toolkit.language_model.reference import _preprocess_message
  from unique_toolkit.language_model.schemas import (
      LanguageModelStreamResponse,
  )
@@ -40,13 +43,25 @@ class HallucinationEvaluation(Evaluation):
      async def run(
          self, loop_response: LanguageModelStreamResponse
      ) -> EvaluationMetricResult:  # type: ignore
-         chunks = self._reference_manager.get_chunks()
+         all_chunks = self._reference_manager.get_chunks()
+
+         # source numbers from original text
+         ref_pattern = r"\[(\d+)\]"
+         original_text = loop_response.message.original_text
+
+         # preprocess original text to deal with different source patterns
+         original_text_preprocessed = _preprocess_message(original_text)
+
+         source_number_matches = re.findall(ref_pattern, original_text_preprocessed)
+         source_numbers = {int(num) for num in source_number_matches}
+
+         referenced_chunks = [all_chunks[idx] for idx in source_numbers]

          evaluation_result: EvaluationMetricResult = await check_hallucination(
              company_id=self._company_id,
              input=EvaluationMetricInput(
                  input_text=self._user_message,
-                 context_texts=[context.text for context in chunks],
+                 context_texts=[context.text for context in referenced_chunks],
                  history_messages=[],  # TODO include loop_history messages
                  output_text=loop_response.message.text,
              ),
@@ -78,11 +93,19 @@
              if not evaluation_result.error
              else ChatMessageAssessmentStatus.ERROR
          )
+         explanation = evaluation_result.reason
+
+         if status == ChatMessageAssessmentStatus.ERROR:
+             title = "Hallucination Check Error"
+             label = ChatMessageAssessmentLabel.RED
+             explanation = (
+                 "An unrecoverable error occurred while evaluating the response."
+             )

          return EvaluationAssessmentMessage(
              status=status,
              title=title,
-             explanation=evaluation_result.reason,
+             explanation=explanation,
              label=label,
              type=self.get_assessment_type(),
          )
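
The new chunk filtering is easy to verify in isolation: only sources actually cited in the model's answer are passed to the hallucination check, instead of every retrieved chunk. A standalone sketch of the extraction step, assuming input that already uses plain [n] markers (the real code first normalizes other citation styles via _preprocess_message and uses the third-party regex module):

    import re

    ref_pattern = r"\[(\d+)\]"
    text = "Revenue grew 12% [0] while costs fell [2], see also [2]."

    source_numbers = {int(n) for n in re.findall(ref_pattern, text)}
    print(sorted(source_numbers))  # [0, 2] -- duplicate citations collapse in the set

    all_chunks = ["chunk A", "chunk B", "chunk C"]
    referenced_chunks = [all_chunks[i] for i in source_numbers]
    # Only the cited chunks become context_texts for check_hallucination.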
unique_toolkit/agentic/history_manager/history_manager.py
@@ -3,6 +3,9 @@ from typing import Annotated, Awaitable, Callable

  from pydantic import BaseModel, Field

+ from unique_toolkit._common.feature_flags.schema import (
+     FeatureExtendedSourceSerialization,
+ )
  from unique_toolkit._common.validators import LMI
  from unique_toolkit.agentic.history_manager.loop_token_reducer import LoopTokenReducer
  from unique_toolkit.agentic.history_manager.utils import transform_chunks_to_string
@@ -10,7 +13,8 @@ from unique_toolkit.agentic.reference_manager.reference_manager import Reference
  from unique_toolkit.agentic.tools.config import get_configuration_dict
  from unique_toolkit.agentic.tools.schemas import ToolCallResponse
  from unique_toolkit.app.schemas import ChatEvent
- from unique_toolkit.language_model.infos import LanguageModelInfo, LanguageModelName
+ from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+ from unique_toolkit.language_model.infos import LanguageModelInfo
  from unique_toolkit.language_model.schemas import (
      LanguageModelAssistantMessage,
      LanguageModelFunction,
@@ -41,11 +45,7 @@ class UploadedContentConfig(BaseModel):
      )


- class ExperimentalFeatures(BaseModel):
-     full_sources_serialize_dump: bool = Field(
-         default=False,
-         description="If True, the sources will be serialized in full, otherwise only the content will be serialized.",
-     )
+ class ExperimentalFeatures(FeatureExtendedSourceSerialization): ...


  class HistoryManagerConfig(BaseModel):
@@ -61,9 +61,7 @@ class HistoryManagerConfig(BaseModel):
          description="The fraction of the max input tokens that will be reserved for the history.",
      )

-     language_model: LMI = LanguageModelInfo.from_name(
-         LanguageModelName.AZURE_GPT_4o_2024_1120
-     )
+     language_model: LMI = LanguageModelInfo.from_name(DEFAULT_GPT_4o)

      @property
      def max_history_tokens(self) -> int:
@@ -122,9 +120,16 @@ class HistoryManager:
              reference_manager=reference_manager,
          )
          self._tool_call_result_history: list[ToolCallResponse] = []
+         self._tool_calls: list[LanguageModelFunction] = []
          self._loop_history: list[LanguageModelMessage] = []
          self._source_enumerator = 0

+     def add_tool_call(self, tool_call: LanguageModelFunction) -> None:
+         self._tool_calls.append(tool_call)
+
+     def get_tool_calls(self) -> list[LanguageModelFunction]:
+         return self._tool_calls
+
      def has_no_loop_messages(self) -> bool:
          return len(self._loop_history) == 0
@@ -173,8 +178,6 @@
          stringified_sources, sources = transform_chunks_to_string(
              content_chunks,
              self._source_enumerator,
-             None,  # Use None for SourceFormatConfig
-             self._config.experimental_features.full_sources_serialize_dump,
          )

          self._source_enumerator += len(
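
HistoryManager now also records the raw tool calls next to the tool-call results. A minimal sketch of the new accessors; the constructor arguments for LanguageModelFunction are an assumption for illustration only:

    from unique_toolkit.language_model.schemas import LanguageModelFunction

    # field names assumed; see language_model/schemas.py for the real model
    call = LanguageModelFunction(id="call_1", name="SearchTool", arguments='{"q": "AI"}')
    history_manager.add_tool_call(call)  # record each call as the loop issues it

    for recorded in history_manager.get_tool_calls():
        print(recorded.name)  # calls come back in insertion order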
unique_toolkit/agentic/history_manager/loop_token_reducer.py
@@ -266,7 +266,9 @@
          selected_messages = []
          token_count = 0
          for msg in messages[::-1]:
-             msg_token_count = self._count_tokens(str(msg.content))
+             msg_token_count = self._count_message_tokens(
+                 LanguageModelMessages(root=[msg])
+             )
              if token_count + msg_token_count > token_limit:
                  break
              selected_messages.append(msg)
@@ -293,9 +295,6 @@
          )
          return messages

-     def _count_tokens(self, text: str) -> int:
-         return len(self._encoder.encode(text))
-
      def ensure_last_message_is_user_message(self, limited_history_messages):
          """
          As the token limit can be reached in the middle of a gpt_request,
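
The counting change above matters because str(msg.content) sees only the content field: role overhead and tool-call payloads are invisible, and a tool-call message with content=None counts as the four characters of "None". A toy illustration with tiktoken; the encoding choice and the message dict shape are assumptions, the point is only the undercount:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")

    # An assistant message that only carries a tool call has no content.
    msg = {
        "role": "assistant",
        "content": None,
        "tool_calls": [{"name": "search", "arguments": '{"q": "AI research"}'}],
    }

    naive = len(enc.encode(str(msg["content"])))  # tokens in the string "None"
    full = len(enc.encode(str(msg)))              # role + tool call included
    print(naive, full)  # the naive count badly undercounts tool-call messages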
unique_toolkit/agentic/history_manager/utils.py
@@ -1,12 +1,10 @@
  import json
  import logging
  from copy import deepcopy
+ from typing import Any

  from unique_toolkit.agentic.tools.schemas import Source
- from unique_toolkit.agentic.tools.utils.source_handling.schema import (
-     SourceFormatConfig,
- )
- from unique_toolkit.content.schemas import ContentChunk, ContentMetadata
+ from unique_toolkit.content.schemas import ContentChunk
  from unique_toolkit.language_model.schemas import (
      LanguageModelAssistantMessage,
      LanguageModelMessage,
@@ -62,58 +60,25 @@ def _convert_tool_call_response_to_content(
  def transform_chunks_to_string(
      content_chunks: list[ContentChunk],
      max_source_number: int,
-     cfg: SourceFormatConfig | None,
-     full_sources_serialize_dump: bool = False,
- ) -> tuple[str, list[Source]]:
+ ) -> tuple[str, list[dict[str, Any]]]:
      """Transform content chunks into a string of sources.

      Args:
          content_chunks (list[ContentChunk]): The content chunks to transform
          max_source_number (int): The maximum source number to use
-         feature_full_sources (bool, optional): Whether to include the full source object. Defaults to False which is the old format.

      Returns:
          str: String for the tool call response
      """
      if not content_chunks:
          return "No relevant sources found.", []
-     if full_sources_serialize_dump:
-         sources = [
-             Source(
-                 source_number=max_source_number + i,
-                 key=chunk.key,
-                 id=chunk.id,
-                 order=chunk.order,
-                 content=chunk.text,
-                 chunk_id=chunk.chunk_id,
-                 metadata=(
-                     _format_metadata(chunk.metadata, cfg) or None
-                     if chunk.metadata
-                     else None
-                 ),
-                 url=chunk.url,
-             ).model_dump(
-                 exclude_none=True,
-                 exclude_defaults=True,
-                 by_alias=True,
-             )
-             for i, chunk in enumerate(content_chunks)
-         ]
-     else:
-         sources = [
-             {
-                 "source_number": max_source_number + i,
-                 "content": chunk.text,
-                 **(
-                     {"metadata": meta}
-                     if (
-                         meta := _format_metadata(chunk.metadata, cfg)
-                     )  # only add when not empty
-                     else {}
-                 ),
-             }
-             for i, chunk in enumerate(content_chunks)
-         ]
+     sources: list[dict[str, Any]] = [
+         {
+             "source_number": max_source_number + i,
+             "content": chunk.text,
+         }
+         for i, chunk in enumerate(content_chunks)
+     ]

      return json.dumps(sources), sources


@@ -129,45 +94,3 @@ def load_sources_from_string(
      except (json.JSONDecodeError, ValueError):
          logger.warning("Failed to parse source string")
          return None
-
-
- def _format_metadata(
-     metadata: ContentMetadata | None,
-     cfg: SourceFormatConfig | None,
- ) -> str:
-     """
-     Build the concatenated tag string from the chunk's metadata
-     and the templates found in cfg.sections.
-     Example result:
-         "<|topic|>GenAI<|/topic|>\n<|date|>This document is from: 2025-04-29<|/date|>\n"
-     """
-     if metadata is None:
-         return ""
-
-     if cfg is None or not cfg.sections:
-         # If no configuration is provided, return empty string
-         return ""
-
-     meta_dict = metadata.model_dump(exclude_none=True, by_alias=True)
-     sections = cfg.sections
-
-     parts: list[str] = []
-     for key, template in sections.items():
-         if key in meta_dict:
-             parts.append(template.format(meta_dict[key]))
-
-     return "".join(parts)
-
-
- ### In case we do not want any formatting of metadata we could use this function
- # def _filtered_metadata(
- #     meta: ContentMetadata | None,
- #     cfg: SourceFormatConfig,
- # ) -> dict[str, str] | None:
- #     if meta is None:
- #         return None
-
- #     allowed = set(cfg.sections)
- #     raw = meta.model_dump(exclude_none=True, by_alias=True)
- #     pruned = {k: v for k, v in raw.items() if k in allowed}
- #     return pruned or None
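
After this rewrite, transform_chunks_to_string always emits the flat format: one {"source_number", "content"} dict per chunk, numbered from max_source_number, serialized with json.dumps. A quick sketch of the resulting payload (plain strings stand in for ContentChunk objects, since only the chunk's text is used):

    import json

    chunk_texts = ["First passage.", "Second passage."]  # stand-ins for chunk.text
    max_source_number = 5  # continue numbering after previously emitted sources

    sources = [
        {"source_number": max_source_number + i, "content": text}
        for i, text in enumerate(chunk_texts)
    ]
    print(json.dumps(sources))
    # [{"source_number": 5, "content": "First passage."},
    #  {"source_number": 6, "content": "Second passage."}]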