unique_toolkit 0.7.9__py3-none-any.whl → 1.33.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/__init__.py +36 -3
- unique_toolkit/_common/api_calling/human_verification_manager.py +357 -0
- unique_toolkit/_common/base_model_type_attribute.py +303 -0
- unique_toolkit/_common/chunk_relevancy_sorter/config.py +49 -0
- unique_toolkit/_common/chunk_relevancy_sorter/exception.py +5 -0
- unique_toolkit/_common/chunk_relevancy_sorter/schemas.py +46 -0
- unique_toolkit/_common/chunk_relevancy_sorter/service.py +374 -0
- unique_toolkit/_common/chunk_relevancy_sorter/tests/test_service.py +275 -0
- unique_toolkit/_common/default_language_model.py +12 -0
- unique_toolkit/_common/docx_generator/__init__.py +7 -0
- unique_toolkit/_common/docx_generator/config.py +12 -0
- unique_toolkit/_common/docx_generator/schemas.py +80 -0
- unique_toolkit/_common/docx_generator/service.py +225 -0
- unique_toolkit/_common/docx_generator/template/Doc Template.docx +0 -0
- unique_toolkit/_common/endpoint_builder.py +368 -0
- unique_toolkit/_common/endpoint_requestor.py +480 -0
- unique_toolkit/_common/exception.py +24 -0
- unique_toolkit/_common/experimental/endpoint_builder.py +368 -0
- unique_toolkit/_common/experimental/endpoint_requestor.py +488 -0
- unique_toolkit/_common/feature_flags/schema.py +9 -0
- unique_toolkit/_common/pydantic/rjsf_tags.py +936 -0
- unique_toolkit/_common/pydantic_helpers.py +174 -0
- unique_toolkit/_common/referencing.py +53 -0
- unique_toolkit/_common/string_utilities.py +140 -0
- unique_toolkit/_common/tests/test_referencing.py +521 -0
- unique_toolkit/_common/tests/test_string_utilities.py +506 -0
- unique_toolkit/_common/token/image_token_counting.py +67 -0
- unique_toolkit/_common/token/token_counting.py +204 -0
- unique_toolkit/_common/utils/__init__.py +1 -0
- unique_toolkit/_common/utils/files.py +43 -0
- unique_toolkit/_common/utils/image/encode.py +25 -0
- unique_toolkit/_common/utils/jinja/helpers.py +10 -0
- unique_toolkit/_common/utils/jinja/render.py +18 -0
- unique_toolkit/_common/utils/jinja/schema.py +65 -0
- unique_toolkit/_common/utils/jinja/utils.py +80 -0
- unique_toolkit/_common/utils/structured_output/__init__.py +1 -0
- unique_toolkit/_common/utils/structured_output/schema.py +5 -0
- unique_toolkit/_common/utils/write_configuration.py +51 -0
- unique_toolkit/_common/validators.py +101 -4
- unique_toolkit/agentic/__init__.py +1 -0
- unique_toolkit/agentic/debug_info_manager/debug_info_manager.py +28 -0
- unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py +278 -0
- unique_toolkit/agentic/evaluation/config.py +36 -0
- unique_toolkit/{evaluators → agentic/evaluation}/context_relevancy/prompts.py +25 -0
- unique_toolkit/agentic/evaluation/context_relevancy/schema.py +80 -0
- unique_toolkit/agentic/evaluation/context_relevancy/service.py +273 -0
- unique_toolkit/agentic/evaluation/evaluation_manager.py +218 -0
- unique_toolkit/agentic/evaluation/hallucination/constants.py +61 -0
- unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py +112 -0
- unique_toolkit/{evaluators → agentic/evaluation}/hallucination/prompts.py +1 -1
- unique_toolkit/{evaluators → agentic/evaluation}/hallucination/service.py +20 -16
- unique_toolkit/{evaluators → agentic/evaluation}/hallucination/utils.py +32 -21
- unique_toolkit/{evaluators → agentic/evaluation}/output_parser.py +20 -2
- unique_toolkit/{evaluators → agentic/evaluation}/schemas.py +27 -7
- unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +253 -0
- unique_toolkit/agentic/evaluation/tests/test_output_parser.py +87 -0
- unique_toolkit/agentic/history_manager/history_construction_with_contents.py +298 -0
- unique_toolkit/agentic/history_manager/history_manager.py +241 -0
- unique_toolkit/agentic/history_manager/loop_token_reducer.py +484 -0
- unique_toolkit/agentic/history_manager/utils.py +96 -0
- unique_toolkit/agentic/message_log_manager/__init__.py +5 -0
- unique_toolkit/agentic/message_log_manager/service.py +93 -0
- unique_toolkit/agentic/postprocessor/postprocessor_manager.py +212 -0
- unique_toolkit/agentic/reference_manager/reference_manager.py +103 -0
- unique_toolkit/agentic/responses_api/__init__.py +19 -0
- unique_toolkit/agentic/responses_api/postprocessors/code_display.py +71 -0
- unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +297 -0
- unique_toolkit/agentic/responses_api/stream_handler.py +15 -0
- unique_toolkit/agentic/short_term_memory_manager/persistent_short_term_memory_manager.py +141 -0
- unique_toolkit/agentic/thinking_manager/thinking_manager.py +103 -0
- unique_toolkit/agentic/tools/__init__.py +1 -0
- unique_toolkit/agentic/tools/a2a/__init__.py +36 -0
- unique_toolkit/agentic/tools/a2a/config.py +17 -0
- unique_toolkit/agentic/tools/a2a/evaluation/__init__.py +15 -0
- unique_toolkit/agentic/tools/a2a/evaluation/_utils.py +66 -0
- unique_toolkit/agentic/tools/a2a/evaluation/config.py +55 -0
- unique_toolkit/agentic/tools/a2a/evaluation/evaluator.py +260 -0
- unique_toolkit/agentic/tools/a2a/evaluation/summarization_user_message.j2 +9 -0
- unique_toolkit/agentic/tools/a2a/manager.py +55 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/__init__.py +21 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/_display_utils.py +240 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/_ref_utils.py +84 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/config.py +78 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/display.py +264 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/references.py +101 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display.py +421 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display_utils.py +2103 -0
- unique_toolkit/agentic/tools/a2a/postprocessing/test/test_ref_utils.py +603 -0
- unique_toolkit/agentic/tools/a2a/prompts.py +46 -0
- unique_toolkit/agentic/tools/a2a/response_watcher/__init__.py +6 -0
- unique_toolkit/agentic/tools/a2a/response_watcher/service.py +91 -0
- unique_toolkit/agentic/tools/a2a/tool/__init__.py +4 -0
- unique_toolkit/agentic/tools/a2a/tool/_memory.py +26 -0
- unique_toolkit/agentic/tools/a2a/tool/_schema.py +9 -0
- unique_toolkit/agentic/tools/a2a/tool/config.py +158 -0
- unique_toolkit/agentic/tools/a2a/tool/service.py +393 -0
- unique_toolkit/agentic/tools/agent_chunks_hanlder.py +65 -0
- unique_toolkit/agentic/tools/config.py +128 -0
- unique_toolkit/agentic/tools/factory.py +44 -0
- unique_toolkit/agentic/tools/mcp/__init__.py +4 -0
- unique_toolkit/agentic/tools/mcp/manager.py +71 -0
- unique_toolkit/agentic/tools/mcp/models.py +28 -0
- unique_toolkit/agentic/tools/mcp/tool_wrapper.py +234 -0
- unique_toolkit/agentic/tools/openai_builtin/__init__.py +11 -0
- unique_toolkit/agentic/tools/openai_builtin/base.py +46 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/__init__.py +8 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/config.py +88 -0
- unique_toolkit/agentic/tools/openai_builtin/code_interpreter/service.py +250 -0
- unique_toolkit/agentic/tools/openai_builtin/manager.py +79 -0
- unique_toolkit/agentic/tools/schemas.py +145 -0
- unique_toolkit/agentic/tools/test/test_mcp_manager.py +536 -0
- unique_toolkit/agentic/tools/test/test_tool_progress_reporter.py +445 -0
- unique_toolkit/agentic/tools/tool.py +187 -0
- unique_toolkit/agentic/tools/tool_manager.py +492 -0
- unique_toolkit/agentic/tools/tool_progress_reporter.py +285 -0
- unique_toolkit/agentic/tools/utils/__init__.py +19 -0
- unique_toolkit/agentic/tools/utils/execution/__init__.py +1 -0
- unique_toolkit/agentic/tools/utils/execution/execution.py +286 -0
- unique_toolkit/agentic/tools/utils/source_handling/__init__.py +0 -0
- unique_toolkit/agentic/tools/utils/source_handling/schema.py +21 -0
- unique_toolkit/agentic/tools/utils/source_handling/source_formatting.py +207 -0
- unique_toolkit/agentic/tools/utils/source_handling/tests/test_source_formatting.py +216 -0
- unique_toolkit/app/__init__.py +9 -0
- unique_toolkit/app/dev_util.py +180 -0
- unique_toolkit/app/fast_api_factory.py +131 -0
- unique_toolkit/app/init_sdk.py +32 -1
- unique_toolkit/app/schemas.py +206 -31
- unique_toolkit/app/unique_settings.py +367 -0
- unique_toolkit/app/webhook.py +77 -0
- unique_toolkit/chat/__init__.py +8 -1
- unique_toolkit/chat/deprecated/service.py +232 -0
- unique_toolkit/chat/functions.py +648 -78
- unique_toolkit/chat/rendering.py +34 -0
- unique_toolkit/chat/responses_api.py +461 -0
- unique_toolkit/chat/schemas.py +134 -2
- unique_toolkit/chat/service.py +115 -767
- unique_toolkit/content/functions.py +353 -8
- unique_toolkit/content/schemas.py +128 -15
- unique_toolkit/content/service.py +321 -45
- unique_toolkit/content/smart_rules.py +301 -0
- unique_toolkit/content/utils.py +10 -3
- unique_toolkit/data_extraction/README.md +96 -0
- unique_toolkit/data_extraction/__init__.py +11 -0
- unique_toolkit/data_extraction/augmented/__init__.py +5 -0
- unique_toolkit/data_extraction/augmented/service.py +93 -0
- unique_toolkit/data_extraction/base.py +25 -0
- unique_toolkit/data_extraction/basic/__init__.py +11 -0
- unique_toolkit/data_extraction/basic/config.py +18 -0
- unique_toolkit/data_extraction/basic/prompt.py +13 -0
- unique_toolkit/data_extraction/basic/service.py +55 -0
- unique_toolkit/embedding/service.py +103 -12
- unique_toolkit/framework_utilities/__init__.py +1 -0
- unique_toolkit/framework_utilities/langchain/__init__.py +10 -0
- unique_toolkit/framework_utilities/langchain/client.py +71 -0
- unique_toolkit/framework_utilities/langchain/history.py +19 -0
- unique_toolkit/framework_utilities/openai/__init__.py +6 -0
- unique_toolkit/framework_utilities/openai/client.py +84 -0
- unique_toolkit/framework_utilities/openai/message_builder.py +229 -0
- unique_toolkit/framework_utilities/utils.py +23 -0
- unique_toolkit/language_model/__init__.py +3 -0
- unique_toolkit/language_model/_responses_api_utils.py +93 -0
- unique_toolkit/language_model/builder.py +27 -11
- unique_toolkit/language_model/default_language_model.py +3 -0
- unique_toolkit/language_model/functions.py +345 -43
- unique_toolkit/language_model/infos.py +1288 -46
- unique_toolkit/language_model/reference.py +242 -0
- unique_toolkit/language_model/schemas.py +481 -49
- unique_toolkit/language_model/service.py +229 -28
- unique_toolkit/protocols/support.py +145 -0
- unique_toolkit/services/__init__.py +7 -0
- unique_toolkit/services/chat_service.py +1631 -0
- unique_toolkit/services/knowledge_base.py +1094 -0
- unique_toolkit/short_term_memory/service.py +178 -41
- unique_toolkit/smart_rules/__init__.py +0 -0
- unique_toolkit/smart_rules/compile.py +56 -0
- unique_toolkit/test_utilities/events.py +197 -0
- unique_toolkit-1.33.3.dist-info/METADATA +1145 -0
- unique_toolkit-1.33.3.dist-info/RECORD +205 -0
- unique_toolkit/evaluators/__init__.py +0 -1
- unique_toolkit/evaluators/config.py +0 -35
- unique_toolkit/evaluators/constants.py +0 -1
- unique_toolkit/evaluators/context_relevancy/constants.py +0 -32
- unique_toolkit/evaluators/context_relevancy/service.py +0 -53
- unique_toolkit/evaluators/context_relevancy/utils.py +0 -142
- unique_toolkit/evaluators/hallucination/constants.py +0 -41
- unique_toolkit-0.7.9.dist-info/METADATA +0 -413
- unique_toolkit-0.7.9.dist-info/RECORD +0 -64
- /unique_toolkit/{evaluators → agentic/evaluation}/exception.py +0 -0
- {unique_toolkit-0.7.9.dist-info → unique_toolkit-1.33.3.dist-info}/LICENSE +0 -0
- {unique_toolkit-0.7.9.dist-info → unique_toolkit-1.33.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from datetime import datetime, timedelta, timezone
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Any, Dict, List, Mapping, Self, Union
|
|
5
|
+
|
|
6
|
+
from pydantic import AliasChoices, BaseModel, Field
|
|
7
|
+
from pydantic.config import ConfigDict
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Operator(str, Enum):
    """Operators available to UniqueQL statements.

    The string values are the wire names used in serialized queries.
    """

    EQUALS = "equals"
    NOT_EQUALS = "notEquals"
    GREATER_THAN = "greaterThan"
    GREATER_THAN_OR_EQUAL = "greaterThanOrEqual"
    LESS_THAN = "lessThan"
    LESS_THAN_OR_EQUAL = "lessThanOrEqual"
    IN = "in"
    NOT_IN = "notIn"
    CONTAINS = "contains"
    NOT_CONTAINS = "notContains"
    IS_NULL = "isNull"
    IS_NOT_NULL = "isNotNull"
    IS_EMPTY = "isEmpty"
    IS_NOT_EMPTY = "isNotEmpty"
    NESTED = "nested"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BaseStatement(BaseModel):
    """Common behaviour shared by every UniqueQL statement node."""

    model_config = ConfigDict(serialize_by_alias=True)

    def with_variables(
        self,
        user_metadata: Mapping[str, Union[str, int, bool]],
        tool_parameters: Mapping[str, Union[str, int, bool]],
    ) -> Self:
        """Return a copy of this statement with variable placeholders resolved."""
        return self._fill_in_variables(user_metadata, tool_parameters)

    def is_compiled(self) -> bool:
        """Report whether the serialized statement still contains placeholders.

        A statement is *not* compiled while any date tag (``<T>``, ``<T+``,
        ``<T-``) or substitution tag (``<toolParameters``, ``<userMetadata``)
        survives in its JSON serialization.
        """
        serialized = self.model_dump_json()
        placeholder_markers = ("<T>", "<T+", "<T-", "<toolParameters", "<userMetadata")
        return any(marker in serialized for marker in placeholder_markers)

    def _fill_in_variables(
        self,
        user_metadata: Mapping[str, Union[str, int, bool]],
        tool_parameters: Mapping[str, Union[str, int, bool]],
    ) -> Self:
        # Base-class default: nothing to substitute, return an identical copy.
        return self.model_copy()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class Statement(BaseStatement):
    """A single comparison clause: ``operator`` applied to ``value`` at ``path``."""

    operator: Operator
    value: Union[str, int, bool, list[str], "AndStatement", "OrStatement"]
    path: List[str] = Field(default_factory=list)

    def _fill_in_variables(
        self,
        user_metadata: Mapping[str, Union[str, int, bool]],
        tool_parameters: Mapping[str, Union[str, int, bool]],
    ) -> Self:
        # Delegate placeholder substitution to the operator-specific handler,
        # then return a fresh copy carrying the resolved value.
        resolved_value = eval_operator(self, user_metadata, tool_parameters)
        updated = self.model_copy()
        updated.value = resolved_value
        return updated
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class AndStatement(BaseStatement):
    """Conjunction node: every child statement must hold."""

    and_list: List[Union["Statement", "AndStatement", "OrStatement"]] = Field(
        validation_alias=AliasChoices("and", "and_list"), serialization_alias="and"
    )

    def _fill_in_variables(
        self,
        user_metadata: Mapping[str, Union[str, int, bool]],
        tool_parameters: Mapping[str, Union[str, int, bool]],
    ) -> Self:
        # Substitute recursively in each child, leaving this node untouched.
        resolved_children = [
            child._fill_in_variables(user_metadata, tool_parameters)
            for child in self.and_list
        ]
        updated = self.model_copy()
        updated.and_list = resolved_children
        return updated
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class OrStatement(BaseStatement):
    """Disjunction node: at least one child statement must hold."""

    or_list: List[Union["Statement", "AndStatement", "OrStatement"]] = Field(
        validation_alias=AliasChoices("or", "or_list"), serialization_alias="or"
    )

    def _fill_in_variables(
        self,
        user_metadata: Mapping[str, Union[str, int, bool]],
        tool_parameters: Mapping[str, Union[str, int, bool]],
    ) -> Self:
        # Substitute recursively in each child, leaving this node untouched.
        resolved_children = [
            child._fill_in_variables(user_metadata, tool_parameters)
            for child in self.or_list
        ]
        updated = self.model_copy()
        updated.or_list = resolved_children
        return updated
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# Rebuild the models so the string forward references ("Statement",
# "AndStatement", "OrStatement") used in the field annotations above
# are resolved now that all three classes exist.
Statement.model_rebuild()
AndStatement.model_rebuild()
OrStatement.model_rebuild()


# Any node of a UniqueQL expression tree.
UniqueQL = Union[Statement, AndStatement, OrStatement]
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def is_array_of_strings(value: Any) -> bool:
    """Return True when *value* is a list whose elements are all strings."""
    if not isinstance(value, list):
        return False
    return all(isinstance(element, str) for element in value)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def eval_operator(
    query: Statement,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Any:
    """Dispatch placeholder resolution for *query* based on its operator.

    Returns the resolved value for the statement; the kind of resolution
    depends on the operator family (binary comparison, array membership,
    null/empty checks, or nested sub-statements).

    Raises:
        ValueError: if the operator is not one of the supported kinds.
    """
    binary_ops = {
        Operator.EQUALS,
        Operator.NOT_EQUALS,
        Operator.GREATER_THAN,
        Operator.GREATER_THAN_OR_EQUAL,
        Operator.LESS_THAN,
        Operator.LESS_THAN_OR_EQUAL,
        Operator.CONTAINS,
        Operator.NOT_CONTAINS,
    }
    if query.operator in binary_ops:
        return binary_operator(query.value, user_metadata, tool_parameters)
    if query.operator in (Operator.IS_NULL, Operator.IS_NOT_NULL):
        return null_operator(query.value, user_metadata, tool_parameters)
    if query.operator in (Operator.IS_EMPTY, Operator.IS_NOT_EMPTY):
        return empty_operator(query.operator, user_metadata, tool_parameters)
    if query.operator == Operator.NESTED:
        return eval_nested_operator(query.value, user_metadata, tool_parameters)
    if query.operator in (Operator.IN, Operator.NOT_IN):
        return array_operator(query.value, user_metadata, tool_parameters)
    raise ValueError(f"Operator {query.operator} not supported")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def eval_nested_operator(
    value: Any,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Union[AndStatement, OrStatement]:
    """Recurse into a nested AND/OR statement.

    Raises:
        ValueError: if *value* is not an ``AndStatement`` or ``OrStatement``.
    """
    if isinstance(value, (AndStatement, OrStatement)):
        return value._fill_in_variables(user_metadata, tool_parameters)
    raise ValueError("Nested operator must be an AndStatement or OrStatement")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def binary_operator(
    value: Any,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Any:
    """Resolve placeholders in the comparison value of a binary operator."""
    return replace_variables(value, user_metadata, tool_parameters)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def array_operator(
    value: Any,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Any:
    """Resolve placeholders element-wise for IN / NOT_IN operator values.

    Non-string-list values are passed through unchanged.
    """
    if not is_array_of_strings(value):
        return value
    return [
        replace_variables(entry, user_metadata, tool_parameters) for entry in value
    ]
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def null_operator(
    value: Any,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Any:
    """Pass the value through untouched for IS_NULL / IS_NOT_NULL checks."""
    return value  # do nothing for now. No variables to replace
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def empty_operator(
    operator: Operator,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Any:
    """Handle IS_EMPTY and IS_NOT_EMPTY operators.

    The metadata mappings are unused: the comparison value is a fixed marker.
    Returns ``None`` for any other operator.
    """
    sentinel_by_operator = {
        Operator.IS_EMPTY: "",
        Operator.IS_NOT_EMPTY: "not_empty",
    }
    return sentinel_by_operator.get(operator)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def calculate_current_date() -> str:
    """Return the current UTC time as an ISO-8601 string with seconds precision."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.isoformat(timespec="seconds")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def calculate_earlier_date(input_str: str) -> str:
    """Return a UTC ISO-8601 timestamp *n* days in the past for an ``<T-n>`` tag.

    Falls back to the current timestamp when *input_str* carries no tag.
    """
    tag = re.search(r"<T-(\d+)>", input_str)
    if tag is None:
        return calculate_current_date()  # no offset tag: behave like <T>
    target = datetime.now(timezone.utc) - timedelta(days=int(tag.group(1)))
    return target.isoformat(timespec="seconds")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def calculate_later_date(input_str: str) -> str:
    """Return a UTC ISO-8601 timestamp *n* days in the future for an ``<T+n>`` tag.

    Falls back to the current timestamp when *input_str* carries no tag.
    """
    tag = re.search(r"<T\+(\d+)>", input_str)  # the "+" must be escaped in regex
    if tag is None:
        return calculate_current_date()  # no offset tag: behave like <T>
    target = datetime.now(timezone.utc) + timedelta(days=int(tag.group(1)))
    return target.isoformat(timespec="seconds")
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def replace_variables(
    value: Any,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Any:
    """Resolve placeholder patterns inside *value*.

    Strings are processed in a fixed order:

    1. ``a||b`` fallback chains are delegated to ``get_fallback_values``.
    2. Date tags (``<T>``, ``<T-n>``, ``<T+n>``) become ISO-8601 timestamps.
    3. ``<toolParameters.x>`` and ``<userMetadata.x>`` tags are substituted.
    4. The substituted string is coerced to ``int`` or ``bool`` when possible.

    Non-string values are returned unchanged.

    Fix: the original body ended with an unreachable ``return value`` after
    the try/except (every branch above already returns); it has been removed.
    """
    if not isinstance(value, str):
        return value

    if "||" in value:
        return get_fallback_values(value, user_metadata, tool_parameters)
    if value == "<T>":
        return calculate_current_date()
    if "<T-" in value:
        return calculate_earlier_date(value)
    if "<T+" in value:
        return calculate_later_date(value)

    value = replace_tool_parameters_patterns(value, tool_parameters)
    value = replace_user_metadata_patterns(value, user_metadata)

    if value == "":
        return value
    try:
        return int(value)
    except ValueError:
        # Normalize textual booleans; anything else stays a string.
        if value.lower() in ["true", "false"]:
            return value.lower() == "true"
        return value
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def replace_tool_parameters_patterns(
    value: str, tool_parameters: Dict[str, Union[str, int, bool]]
) -> str:
    """Substitute every ``<toolParameters.name>`` tag with its parameter value."""

    def substitute(match: re.Match) -> str:
        # Unknown parameters degrade to "" rather than raising.
        return str(tool_parameters.get(match.group(1), ""))

    return re.sub(r"<toolParameters\.(\w+)>", substitute, value)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def replace_user_metadata_patterns(
    value: str, user_metadata: Dict[str, Union[str, int, bool]]
) -> str:
    """Substitute every ``<userMetadata.name>`` tag with its metadata value."""
    # Missing keys resolve to "" so unresolved tags vanish instead of raising.
    return re.sub(
        r"<userMetadata\.(\w+)>",
        lambda match: str(user_metadata.get(match.group(1), "")),
        value,
    )
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def get_fallback_values(
    value: str,
    user_metadata: Mapping[str, Union[str, int, bool]],
    tool_parameters: Mapping[str, Union[str, int, bool]],
) -> Any:
    """Resolve an ``a||b||c`` fallback chain.

    The first candidate that does not resolve to the empty string wins.
    When every candidate resolves to "", the raw list of candidate strings
    is returned unchanged.
    """
    candidates = value.split("||")
    for candidate in candidates:
        resolved = replace_variables(candidate, user_metadata, tool_parameters)
        if resolved != "":
            return resolved
    return candidates
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def parse_uniqueql(json_data: Dict[str, Any]) -> UniqueQL:
    """Parse a raw UniqueQL mapping into its statement-tree representation.

    A mapping with an ``operator`` key becomes a ``Statement``; ``or`` / ``and``
    keys produce the corresponding compound statements, recursing into children.

    Raises:
        ValueError: when the mapping matches none of the known shapes.
    """
    if "operator" in json_data:
        return Statement.model_validate(json_data)
    if "or" in json_data:
        children = [parse_uniqueql(child) for child in json_data["or"]]
        return OrStatement.model_validate({"or": children})
    if "and" in json_data:
        children = [parse_uniqueql(child) for child in json_data["and"]]
        return AndStatement.model_validate({"and": children})
    raise ValueError("Invalid UniqueQL format")
|
unique_toolkit/content/utils.py
CHANGED
|
@@ -190,9 +190,11 @@ def count_tokens(text: str, encoding_model="cl100k_base") -> int:
|
|
|
190
190
|
return len(encoding.encode(text))
|
|
191
191
|
|
|
192
192
|
|
|
193
|
-
def map_content_chunk(content_chunk: dict):
|
|
193
|
+
def map_content_chunk(content_id: str, content_key: str, content_chunk: dict):
|
|
194
194
|
return ContentChunk(
|
|
195
|
-
id=
|
|
195
|
+
id=content_id,
|
|
196
|
+
key=content_key,
|
|
197
|
+
chunk_id=content_chunk["id"],
|
|
196
198
|
text=content_chunk["text"],
|
|
197
199
|
start_page=content_chunk["startPage"],
|
|
198
200
|
end_page=content_chunk["endPage"],
|
|
@@ -206,9 +208,14 @@ def map_content(content: dict):
|
|
|
206
208
|
key=content["key"],
|
|
207
209
|
title=content["title"],
|
|
208
210
|
url=content["url"],
|
|
209
|
-
chunks=[
|
|
211
|
+
chunks=[
|
|
212
|
+
map_content_chunk(content["id"], content["key"], chunk)
|
|
213
|
+
for chunk in content["chunks"]
|
|
214
|
+
],
|
|
210
215
|
created_at=content["createdAt"],
|
|
211
216
|
updated_at=content["updatedAt"],
|
|
217
|
+
ingestion_state=content.get("ingestionState"),
|
|
218
|
+
expired_at=content.get("expiredAt"),
|
|
212
219
|
)
|
|
213
220
|
|
|
214
221
|
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# Data Extraction Module
|
|
2
|
+
|
|
3
|
+
This module provides a flexible framework for extracting structured data from text using language models. It supports both basic and augmented data extraction capabilities.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The module consists of two main components:
|
|
8
|
+
|
|
9
|
+
1. **Basic Data Extraction**: Uses language models to extract structured data from text based on a provided schema.
|
|
10
|
+
2. **Augmented Data Extraction**: Extends basic extraction by adding extra fields to the output schema while maintaining the original data structure.
|
|
11
|
+
|
|
12
|
+
## Components
|
|
13
|
+
|
|
14
|
+
### Base Classes
|
|
15
|
+
|
|
16
|
+
- `BaseDataExtractor`: Abstract base class that defines the interface for data extraction
|
|
17
|
+
- `BaseDataExtractionResult`: Generic base class for extraction results
|
|
18
|
+
|
|
19
|
+
### Basic Extraction
|
|
20
|
+
|
|
21
|
+
- `StructuredOutputDataExtractor`: Implements basic data extraction using language models
|
|
22
|
+
- `StructuredOutputDataExtractorConfig`: Configuration for the basic extractor
|
|
23
|
+
|
|
24
|
+
### Augmented Extraction
|
|
25
|
+
|
|
26
|
+
- `AugmentedDataExtractor`: Extends basic extraction with additional fields
|
|
27
|
+
- `AugmentedDataExtractionResult`: Result type for augmented extraction
|
|
28
|
+
|
|
29
|
+
## Usage Examples
|
|
30
|
+
|
|
31
|
+
### Basic Data Extraction
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from pydantic import BaseModel
|
|
35
|
+
from unique_toolkit.data_extraction import StructuredOutputDataExtractor, StructuredOutputDataExtractorConfig
|
|
36
|
+
from unique_toolkit import LanguageModelService
|
|
37
|
+
|
|
38
|
+
# Define your schema
|
|
39
|
+
class PersonInfo(BaseModel):
|
|
40
|
+
name: str
|
|
41
|
+
age: int
|
|
42
|
+
occupation: str
|
|
43
|
+
|
|
44
|
+
# Create the extractor
|
|
45
|
+
config = StructuredOutputDataExtractorConfig()
|
|
46
|
+
lm_service = LanguageModelService() # Configure as needed
|
|
47
|
+
extractor = StructuredOutputDataExtractor(config, lm_service)
|
|
48
|
+
|
|
49
|
+
# Extract data
|
|
50
|
+
text = "John is 30 years old and works as a software engineer."
|
|
51
|
+
result = await extractor.extract_data_from_text(text, PersonInfo)
|
|
52
|
+
print(result.data) # PersonInfo(name="John", age=30, occupation="software engineer")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Augmented Data Extraction
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from pydantic import BaseModel, Field
|
|
59
|
+
from unique_toolkit.data_extraction import AugmentedDataExtractor, StructuredOutputDataExtractor
|
|
60
|
+
|
|
61
|
+
# Define your base schema
|
|
62
|
+
class PersonInfo(BaseModel):
|
|
63
|
+
name: str
|
|
64
|
+
age: int
|
|
65
|
+
|
|
66
|
+
# Create base extractor
|
|
67
|
+
base_extractor = StructuredOutputDataExtractor(...)
|
|
68
|
+
|
|
69
|
+
# Create augmented extractor with confidence scores
|
|
70
|
+
augmented_extractor = AugmentedDataExtractor(
|
|
71
|
+
base_extractor,
|
|
72
|
+
confidence=float,
|
|
73
|
+
source=("extracted", Field(description="Source of the information"))
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# Extract data
|
|
77
|
+
text = "John is 30 years old."
|
|
78
|
+
result = await augmented_extractor.extract_data_from_text(text, PersonInfo)
|
|
79
|
+
print(result.data) # Original PersonInfo
|
|
80
|
+
print(result.augmented_data) # Contains additional fields
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Configuration
|
|
84
|
+
|
|
85
|
+
The `StructuredOutputDataExtractorConfig` allows customization of:
|
|
86
|
+
|
|
87
|
+
- Language model selection
|
|
88
|
+
- System and user prompt templates
|
|
89
|
+
- Schema enforcement settings
|
|
90
|
+
|
|
91
|
+
## Best Practices
|
|
92
|
+
|
|
93
|
+
1. Always define clear Pydantic models for your extraction schemas
|
|
94
|
+
2. Use augmented extraction when you need additional metadata
|
|
95
|
+
3. Consider using strict mode for augmented extraction when you want to enforce schema compliance
|
|
96
|
+
4. Customize prompts for better extraction results in specific domains
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Public re-exports for the data-extraction package."""

from unique_toolkit.data_extraction.augmented import AugmentedDataExtractor
from unique_toolkit.data_extraction.basic import (
    StructuredOutputDataExtractor,
    StructuredOutputDataExtractorConfig,
)

__all__ = [
    "StructuredOutputDataExtractor",
    "StructuredOutputDataExtractorConfig",
    "AugmentedDataExtractor",
]
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from docxtpl.template import Any
|
|
2
|
+
from pydantic import BaseModel, create_model
|
|
3
|
+
from pydantic.alias_generators import to_pascal
|
|
4
|
+
from pydantic.fields import FieldInfo
|
|
5
|
+
from typing_extensions import override
|
|
6
|
+
|
|
7
|
+
from unique_toolkit.data_extraction.base import (
|
|
8
|
+
BaseDataExtractionResult,
|
|
9
|
+
BaseDataExtractor,
|
|
10
|
+
ExtractionSchema,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _build_augmented_model_for_field(
    field_name: str,
    field_type: Any | tuple[Any, FieldInfo],
    strict: bool = False,
    **extra_fields: Any | tuple[Any, FieldInfo],
) -> type[BaseModel]:
    """Build a ``<FieldName>Value`` model carrying the field plus *extra_fields*.

    The original field is inserted after the extras so an identically named
    extra field cannot clobber it.
    """
    model_name = f"{to_pascal(field_name)}Value"

    combined_fields: dict[str, Any | tuple[Any, FieldInfo]] = dict(extra_fields)
    combined_fields[field_name] = field_type

    return create_model(
        model_name,
        **combined_fields,  # type: ignore
        __config__={"extra": "forbid" if strict else "ignore"},
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AugmentedDataExtractionResult(BaseDataExtractionResult[ExtractionSchema]):
    """
    Result of data extraction from text using an augmented schema.
    """

    # The raw LLM output validated against the augmented (wrapped) schema,
    # i.e. including the extra per-field attributes.
    augmented_data: BaseModel
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class AugmentedDataExtractor(BaseDataExtractor):
    """Wrap a base extractor so every schema field also carries extra fields.

    The extra fields (e.g. a confidence score) are attached per field of the
    caller's schema; the result exposes both the original-schema view and the
    augmented view.
    """

    def __init__(
        self,
        base_data_extractor: BaseDataExtractor,
        strict: bool = False,
        **extra_fields: Any | tuple[Any, FieldInfo],
    ):
        self._base_data_extractor = base_data_extractor
        self._extra_fields = extra_fields
        self._strict = strict

    def _prepare_schema(self, schema: type[ExtractionSchema]) -> type[BaseModel]:
        """Build a copy of *schema* where each field is wrapped with the extras."""
        wrapped_fields = {
            name: _build_augmented_model_for_field(
                name,
                (info.annotation, info),
                strict=self._strict,
                **self._extra_fields,
            )
            for name, info in schema.model_fields.items()
        }
        return create_model(
            schema.__name__,
            **wrapped_fields,
            __config__={"extra": "forbid" if self._strict else "ignore"},
            __doc__=schema.__doc__,
        )

    def _extract_output(
        self, llm_output: BaseModel, schema: type[ExtractionSchema]
    ) -> ExtractionSchema:
        """Unwrap the augmented output back into the caller's original schema."""
        # Iterating a pydantic model yields (field_name, wrapped_value) pairs;
        # the wrapped value holds the original datum under the same field name.
        unwrapped = {name: getattr(wrapped, name) for name, wrapped in llm_output}
        return schema.model_validate(unwrapped)

    @override
    async def extract_data_from_text(
        self, text: str, schema: type[ExtractionSchema]
    ) -> AugmentedDataExtractionResult[ExtractionSchema]:
        """Extract data and return both the original-schema and augmented views."""
        augmented_schema = self._prepare_schema(schema)
        base_result = await self._base_data_extractor.extract_data_from_text(
            text, augmented_schema
        )
        augmented_data = base_result.data
        return AugmentedDataExtractionResult(
            data=self._extract_output(augmented_data, schema),
            augmented_data=augmented_data,
        )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Generic, TypeVar
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
ExtractionSchema = TypeVar("ExtractionSchema", bound=BaseModel)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseDataExtractionResult(BaseModel, Generic[ExtractionSchema]):
    """
    Base class for data extraction results.
    """

    # The extracted data, validated against the caller-supplied schema.
    data: ExtractionSchema
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BaseDataExtractor(ABC):
    """
    Extract structured data from text.
    """

    @abstractmethod
    async def extract_data_from_text(
        self, text: str, schema: type[ExtractionSchema]
    ) -> BaseDataExtractionResult[ExtractionSchema]:
        """Extract data matching ``schema`` from ``text``.

        Implementations return the extracted, schema-validated data wrapped
        in a :class:`BaseDataExtractionResult`.
        """
        ...
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Public re-exports for the basic data-extraction package.
from unique_toolkit.data_extraction.basic.config import (
    StructuredOutputDataExtractorConfig,
)
from unique_toolkit.data_extraction.basic.service import (
    StructuredOutputDataExtractor,
)

__all__ = [
    "StructuredOutputDataExtractorConfig",
    "StructuredOutputDataExtractor",
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
from unique_toolkit._common.pydantic_helpers import get_configuration_dict
|
|
4
|
+
from unique_toolkit._common.validators import LMI, get_LMI_default_field
|
|
5
|
+
from unique_toolkit.data_extraction.basic.prompt import (
|
|
6
|
+
DEFAULT_DATA_EXTRACTION_SYSTEM_PROMPT,
|
|
7
|
+
DEFAULT_DATA_EXTRACTION_USER_PROMPT,
|
|
8
|
+
)
|
|
9
|
+
from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class StructuredOutputDataExtractorConfig(BaseModel):
    """Configuration for the structured-output data extractor."""

    model_config = get_configuration_dict()

    # Language model used for the extraction call (defaults to GPT-4o).
    language_model: LMI = get_LMI_default_field(DEFAULT_GPT_4o)
    # Whether the completion call should strictly enforce the output schema.
    structured_output_enforce_schema: bool = False
    # Prompt templates; the user template is rendered with the text to
    # extract from (it references `{{ text }}`).
    system_prompt_template: str = DEFAULT_DATA_EXTRACTION_SYSTEM_PROMPT
    user_prompt_template: str = DEFAULT_DATA_EXTRACTION_USER_PROMPT
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Default system prompt: frames the model as a data-processing expert that
# fills the requested output schema from the given text.
DEFAULT_DATA_EXTRACTION_SYSTEM_PROMPT = """
You are a thorough and accurate expert in data processing.

You will be given some text and an output schema, describing what needs to be extracted from the text.
You will need to extract the data from the text and return it in the output schema.
""".strip()

# Default user prompt template; `{{ text }}` is substituted with the input
# text at render time.
DEFAULT_DATA_EXTRACTION_USER_PROMPT = """
Here is the text to extract data from:
{{ text }}

Please thoroughly extract the data from the text and return it in the output schema.
""".strip()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from typing_extensions import override
|
|
2
|
+
|
|
3
|
+
from unique_toolkit._common.utils.jinja.render import render_template
|
|
4
|
+
from unique_toolkit.data_extraction.base import (
|
|
5
|
+
BaseDataExtractionResult,
|
|
6
|
+
BaseDataExtractor,
|
|
7
|
+
ExtractionSchema,
|
|
8
|
+
)
|
|
9
|
+
from unique_toolkit.data_extraction.basic.config import (
|
|
10
|
+
StructuredOutputDataExtractorConfig,
|
|
11
|
+
)
|
|
12
|
+
from unique_toolkit.language_model import LanguageModelService
|
|
13
|
+
from unique_toolkit.language_model.builder import MessagesBuilder
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class StructuredOutputDataExtractor(BaseDataExtractor):
    """
    Basic Structured Output Data Extraction.

    Sends the input text to a language model together with the target
    schema and validates the structured response against that schema.
    """

    def __init__(
        self,
        config: StructuredOutputDataExtractorConfig,
        language_model_service: LanguageModelService,
    ):
        """Store the extraction config and the LLM service used for completions."""
        self._config = config
        self._language_model_service = language_model_service

    @override
    async def extract_data_from_text(
        self, text: str, schema: type[ExtractionSchema]
    ) -> BaseDataExtractionResult[ExtractionSchema]:
        """Ask the model to fill ``schema`` from ``text`` and validate the result."""
        user_prompt = render_template(
            self._config.user_prompt_template,
            {
                "text": text,
            },
        )
        builder = MessagesBuilder()
        builder.system_message_append(self._config.system_prompt_template)
        builder.user_message_append(user_prompt)

        # Temperature 0 to keep the extraction as deterministic as possible.
        response = await self._language_model_service.complete_async(
            messages=builder.build(),
            model_name=self._config.language_model.name,
            structured_output_model=schema,
            temperature=0.0,
            structured_output_enforce_schema=self._config.structured_output_enforce_schema,
        )

        parsed = response.choices[0].message.parsed
        return BaseDataExtractionResult(data=schema.model_validate(parsed))
|