unique_toolkit 0.7.9__py3-none-any.whl → 1.33.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. unique_toolkit/__init__.py +36 -3
  2. unique_toolkit/_common/api_calling/human_verification_manager.py +357 -0
  3. unique_toolkit/_common/base_model_type_attribute.py +303 -0
  4. unique_toolkit/_common/chunk_relevancy_sorter/config.py +49 -0
  5. unique_toolkit/_common/chunk_relevancy_sorter/exception.py +5 -0
  6. unique_toolkit/_common/chunk_relevancy_sorter/schemas.py +46 -0
  7. unique_toolkit/_common/chunk_relevancy_sorter/service.py +374 -0
  8. unique_toolkit/_common/chunk_relevancy_sorter/tests/test_service.py +275 -0
  9. unique_toolkit/_common/default_language_model.py +12 -0
  10. unique_toolkit/_common/docx_generator/__init__.py +7 -0
  11. unique_toolkit/_common/docx_generator/config.py +12 -0
  12. unique_toolkit/_common/docx_generator/schemas.py +80 -0
  13. unique_toolkit/_common/docx_generator/service.py +225 -0
  14. unique_toolkit/_common/docx_generator/template/Doc Template.docx +0 -0
  15. unique_toolkit/_common/endpoint_builder.py +368 -0
  16. unique_toolkit/_common/endpoint_requestor.py +480 -0
  17. unique_toolkit/_common/exception.py +24 -0
  18. unique_toolkit/_common/experimental/endpoint_builder.py +368 -0
  19. unique_toolkit/_common/experimental/endpoint_requestor.py +488 -0
  20. unique_toolkit/_common/feature_flags/schema.py +9 -0
  21. unique_toolkit/_common/pydantic/rjsf_tags.py +936 -0
  22. unique_toolkit/_common/pydantic_helpers.py +174 -0
  23. unique_toolkit/_common/referencing.py +53 -0
  24. unique_toolkit/_common/string_utilities.py +140 -0
  25. unique_toolkit/_common/tests/test_referencing.py +521 -0
  26. unique_toolkit/_common/tests/test_string_utilities.py +506 -0
  27. unique_toolkit/_common/token/image_token_counting.py +67 -0
  28. unique_toolkit/_common/token/token_counting.py +204 -0
  29. unique_toolkit/_common/utils/__init__.py +1 -0
  30. unique_toolkit/_common/utils/files.py +43 -0
  31. unique_toolkit/_common/utils/image/encode.py +25 -0
  32. unique_toolkit/_common/utils/jinja/helpers.py +10 -0
  33. unique_toolkit/_common/utils/jinja/render.py +18 -0
  34. unique_toolkit/_common/utils/jinja/schema.py +65 -0
  35. unique_toolkit/_common/utils/jinja/utils.py +80 -0
  36. unique_toolkit/_common/utils/structured_output/__init__.py +1 -0
  37. unique_toolkit/_common/utils/structured_output/schema.py +5 -0
  38. unique_toolkit/_common/utils/write_configuration.py +51 -0
  39. unique_toolkit/_common/validators.py +101 -4
  40. unique_toolkit/agentic/__init__.py +1 -0
  41. unique_toolkit/agentic/debug_info_manager/debug_info_manager.py +28 -0
  42. unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py +278 -0
  43. unique_toolkit/agentic/evaluation/config.py +36 -0
  44. unique_toolkit/{evaluators → agentic/evaluation}/context_relevancy/prompts.py +25 -0
  45. unique_toolkit/agentic/evaluation/context_relevancy/schema.py +80 -0
  46. unique_toolkit/agentic/evaluation/context_relevancy/service.py +273 -0
  47. unique_toolkit/agentic/evaluation/evaluation_manager.py +218 -0
  48. unique_toolkit/agentic/evaluation/hallucination/constants.py +61 -0
  49. unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py +112 -0
  50. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/prompts.py +1 -1
  51. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/service.py +20 -16
  52. unique_toolkit/{evaluators → agentic/evaluation}/hallucination/utils.py +32 -21
  53. unique_toolkit/{evaluators → agentic/evaluation}/output_parser.py +20 -2
  54. unique_toolkit/{evaluators → agentic/evaluation}/schemas.py +27 -7
  55. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +253 -0
  56. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +87 -0
  57. unique_toolkit/agentic/history_manager/history_construction_with_contents.py +298 -0
  58. unique_toolkit/agentic/history_manager/history_manager.py +241 -0
  59. unique_toolkit/agentic/history_manager/loop_token_reducer.py +484 -0
  60. unique_toolkit/agentic/history_manager/utils.py +96 -0
  61. unique_toolkit/agentic/message_log_manager/__init__.py +5 -0
  62. unique_toolkit/agentic/message_log_manager/service.py +93 -0
  63. unique_toolkit/agentic/postprocessor/postprocessor_manager.py +212 -0
  64. unique_toolkit/agentic/reference_manager/reference_manager.py +103 -0
  65. unique_toolkit/agentic/responses_api/__init__.py +19 -0
  66. unique_toolkit/agentic/responses_api/postprocessors/code_display.py +71 -0
  67. unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +297 -0
  68. unique_toolkit/agentic/responses_api/stream_handler.py +15 -0
  69. unique_toolkit/agentic/short_term_memory_manager/persistent_short_term_memory_manager.py +141 -0
  70. unique_toolkit/agentic/thinking_manager/thinking_manager.py +103 -0
  71. unique_toolkit/agentic/tools/__init__.py +1 -0
  72. unique_toolkit/agentic/tools/a2a/__init__.py +36 -0
  73. unique_toolkit/agentic/tools/a2a/config.py +17 -0
  74. unique_toolkit/agentic/tools/a2a/evaluation/__init__.py +15 -0
  75. unique_toolkit/agentic/tools/a2a/evaluation/_utils.py +66 -0
  76. unique_toolkit/agentic/tools/a2a/evaluation/config.py +55 -0
  77. unique_toolkit/agentic/tools/a2a/evaluation/evaluator.py +260 -0
  78. unique_toolkit/agentic/tools/a2a/evaluation/summarization_user_message.j2 +9 -0
  79. unique_toolkit/agentic/tools/a2a/manager.py +55 -0
  80. unique_toolkit/agentic/tools/a2a/postprocessing/__init__.py +21 -0
  81. unique_toolkit/agentic/tools/a2a/postprocessing/_display_utils.py +240 -0
  82. unique_toolkit/agentic/tools/a2a/postprocessing/_ref_utils.py +84 -0
  83. unique_toolkit/agentic/tools/a2a/postprocessing/config.py +78 -0
  84. unique_toolkit/agentic/tools/a2a/postprocessing/display.py +264 -0
  85. unique_toolkit/agentic/tools/a2a/postprocessing/references.py +101 -0
  86. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display.py +421 -0
  87. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_display_utils.py +2103 -0
  88. unique_toolkit/agentic/tools/a2a/postprocessing/test/test_ref_utils.py +603 -0
  89. unique_toolkit/agentic/tools/a2a/prompts.py +46 -0
  90. unique_toolkit/agentic/tools/a2a/response_watcher/__init__.py +6 -0
  91. unique_toolkit/agentic/tools/a2a/response_watcher/service.py +91 -0
  92. unique_toolkit/agentic/tools/a2a/tool/__init__.py +4 -0
  93. unique_toolkit/agentic/tools/a2a/tool/_memory.py +26 -0
  94. unique_toolkit/agentic/tools/a2a/tool/_schema.py +9 -0
  95. unique_toolkit/agentic/tools/a2a/tool/config.py +158 -0
  96. unique_toolkit/agentic/tools/a2a/tool/service.py +393 -0
  97. unique_toolkit/agentic/tools/agent_chunks_hanlder.py +65 -0
  98. unique_toolkit/agentic/tools/config.py +128 -0
  99. unique_toolkit/agentic/tools/factory.py +44 -0
  100. unique_toolkit/agentic/tools/mcp/__init__.py +4 -0
  101. unique_toolkit/agentic/tools/mcp/manager.py +71 -0
  102. unique_toolkit/agentic/tools/mcp/models.py +28 -0
  103. unique_toolkit/agentic/tools/mcp/tool_wrapper.py +234 -0
  104. unique_toolkit/agentic/tools/openai_builtin/__init__.py +11 -0
  105. unique_toolkit/agentic/tools/openai_builtin/base.py +46 -0
  106. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/__init__.py +8 -0
  107. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/config.py +88 -0
  108. unique_toolkit/agentic/tools/openai_builtin/code_interpreter/service.py +250 -0
  109. unique_toolkit/agentic/tools/openai_builtin/manager.py +79 -0
  110. unique_toolkit/agentic/tools/schemas.py +145 -0
  111. unique_toolkit/agentic/tools/test/test_mcp_manager.py +536 -0
  112. unique_toolkit/agentic/tools/test/test_tool_progress_reporter.py +445 -0
  113. unique_toolkit/agentic/tools/tool.py +187 -0
  114. unique_toolkit/agentic/tools/tool_manager.py +492 -0
  115. unique_toolkit/agentic/tools/tool_progress_reporter.py +285 -0
  116. unique_toolkit/agentic/tools/utils/__init__.py +19 -0
  117. unique_toolkit/agentic/tools/utils/execution/__init__.py +1 -0
  118. unique_toolkit/agentic/tools/utils/execution/execution.py +286 -0
  119. unique_toolkit/agentic/tools/utils/source_handling/__init__.py +0 -0
  120. unique_toolkit/agentic/tools/utils/source_handling/schema.py +21 -0
  121. unique_toolkit/agentic/tools/utils/source_handling/source_formatting.py +207 -0
  122. unique_toolkit/agentic/tools/utils/source_handling/tests/test_source_formatting.py +216 -0
  123. unique_toolkit/app/__init__.py +9 -0
  124. unique_toolkit/app/dev_util.py +180 -0
  125. unique_toolkit/app/fast_api_factory.py +131 -0
  126. unique_toolkit/app/init_sdk.py +32 -1
  127. unique_toolkit/app/schemas.py +206 -31
  128. unique_toolkit/app/unique_settings.py +367 -0
  129. unique_toolkit/app/webhook.py +77 -0
  130. unique_toolkit/chat/__init__.py +8 -1
  131. unique_toolkit/chat/deprecated/service.py +232 -0
  132. unique_toolkit/chat/functions.py +648 -78
  133. unique_toolkit/chat/rendering.py +34 -0
  134. unique_toolkit/chat/responses_api.py +461 -0
  135. unique_toolkit/chat/schemas.py +134 -2
  136. unique_toolkit/chat/service.py +115 -767
  137. unique_toolkit/content/functions.py +353 -8
  138. unique_toolkit/content/schemas.py +128 -15
  139. unique_toolkit/content/service.py +321 -45
  140. unique_toolkit/content/smart_rules.py +301 -0
  141. unique_toolkit/content/utils.py +10 -3
  142. unique_toolkit/data_extraction/README.md +96 -0
  143. unique_toolkit/data_extraction/__init__.py +11 -0
  144. unique_toolkit/data_extraction/augmented/__init__.py +5 -0
  145. unique_toolkit/data_extraction/augmented/service.py +93 -0
  146. unique_toolkit/data_extraction/base.py +25 -0
  147. unique_toolkit/data_extraction/basic/__init__.py +11 -0
  148. unique_toolkit/data_extraction/basic/config.py +18 -0
  149. unique_toolkit/data_extraction/basic/prompt.py +13 -0
  150. unique_toolkit/data_extraction/basic/service.py +55 -0
  151. unique_toolkit/embedding/service.py +103 -12
  152. unique_toolkit/framework_utilities/__init__.py +1 -0
  153. unique_toolkit/framework_utilities/langchain/__init__.py +10 -0
  154. unique_toolkit/framework_utilities/langchain/client.py +71 -0
  155. unique_toolkit/framework_utilities/langchain/history.py +19 -0
  156. unique_toolkit/framework_utilities/openai/__init__.py +6 -0
  157. unique_toolkit/framework_utilities/openai/client.py +84 -0
  158. unique_toolkit/framework_utilities/openai/message_builder.py +229 -0
  159. unique_toolkit/framework_utilities/utils.py +23 -0
  160. unique_toolkit/language_model/__init__.py +3 -0
  161. unique_toolkit/language_model/_responses_api_utils.py +93 -0
  162. unique_toolkit/language_model/builder.py +27 -11
  163. unique_toolkit/language_model/default_language_model.py +3 -0
  164. unique_toolkit/language_model/functions.py +345 -43
  165. unique_toolkit/language_model/infos.py +1288 -46
  166. unique_toolkit/language_model/reference.py +242 -0
  167. unique_toolkit/language_model/schemas.py +481 -49
  168. unique_toolkit/language_model/service.py +229 -28
  169. unique_toolkit/protocols/support.py +145 -0
  170. unique_toolkit/services/__init__.py +7 -0
  171. unique_toolkit/services/chat_service.py +1631 -0
  172. unique_toolkit/services/knowledge_base.py +1094 -0
  173. unique_toolkit/short_term_memory/service.py +178 -41
  174. unique_toolkit/smart_rules/__init__.py +0 -0
  175. unique_toolkit/smart_rules/compile.py +56 -0
  176. unique_toolkit/test_utilities/events.py +197 -0
  177. unique_toolkit-1.33.3.dist-info/METADATA +1145 -0
  178. unique_toolkit-1.33.3.dist-info/RECORD +205 -0
  179. unique_toolkit/evaluators/__init__.py +0 -1
  180. unique_toolkit/evaluators/config.py +0 -35
  181. unique_toolkit/evaluators/constants.py +0 -1
  182. unique_toolkit/evaluators/context_relevancy/constants.py +0 -32
  183. unique_toolkit/evaluators/context_relevancy/service.py +0 -53
  184. unique_toolkit/evaluators/context_relevancy/utils.py +0 -142
  185. unique_toolkit/evaluators/hallucination/constants.py +0 -41
  186. unique_toolkit-0.7.9.dist-info/METADATA +0 -413
  187. unique_toolkit-0.7.9.dist-info/RECORD +0 -64
  188. /unique_toolkit/{evaluators → agentic/evaluation}/exception.py +0 -0
  189. {unique_toolkit-0.7.9.dist-info → unique_toolkit-1.33.3.dist-info}/LICENSE +0 -0
  190. {unique_toolkit-0.7.9.dist-info → unique_toolkit-1.33.3.dist-info}/WHEEL +0 -0
@@ -0,0 +1,301 @@
+ import re
+ from datetime import datetime, timedelta, timezone
+ from enum import Enum
+ from typing import Any, Dict, List, Mapping, Self, Union
+
+ from pydantic import AliasChoices, BaseModel, Field
+ from pydantic.config import ConfigDict
+
+
+ class Operator(str, Enum):
+     EQUALS = "equals"
+     NOT_EQUALS = "notEquals"
+     GREATER_THAN = "greaterThan"
+     GREATER_THAN_OR_EQUAL = "greaterThanOrEqual"
+     LESS_THAN = "lessThan"
+     LESS_THAN_OR_EQUAL = "lessThanOrEqual"
+     IN = "in"
+     NOT_IN = "notIn"
+     CONTAINS = "contains"
+     NOT_CONTAINS = "notContains"
+     IS_NULL = "isNull"
+     IS_NOT_NULL = "isNotNull"
+     IS_EMPTY = "isEmpty"
+     IS_NOT_EMPTY = "isNotEmpty"
+     NESTED = "nested"
+
+
+ class BaseStatement(BaseModel):
+     model_config = ConfigDict(serialize_by_alias=True)
+
+     def with_variables(
+         self,
+         user_metadata: Mapping[str, Union[str, int, bool]],
+         tool_parameters: Mapping[str, Union[str, int, bool]],
+     ) -> Self:
+         return self._fill_in_variables(user_metadata, tool_parameters)
+
+     def is_compiled(self) -> bool:
+         # Serialize the object to json string
+         json_str = self.model_dump_json()
+         # Check if the json string has <T> or <T+> or <T-> or <toolParameters or <userMetadata
+         return (
+             "<T>" in json_str
+             or "<T+" in json_str
+             or "<T-" in json_str
+             or "<toolParameters" in json_str
+             or "<userMetadata" in json_str
+         )
+
+     def _fill_in_variables(
+         self,
+         user_metadata: Mapping[str, Union[str, int, bool]],
+         tool_parameters: Mapping[str, Union[str, int, bool]],
+     ) -> Self:
+         return self.model_copy()
+
+
+ class Statement(BaseStatement):
+     operator: Operator
+     value: Union[str, int, bool, list[str], "AndStatement", "OrStatement"]
+     path: List[str] = Field(default_factory=list)
+
+     def _fill_in_variables(
+         self,
+         user_metadata: Mapping[str, Union[str, int, bool]],
+         tool_parameters: Mapping[str, Union[str, int, bool]],
+     ) -> Self:
+         new_stmt = self.model_copy()
+         new_stmt.value = eval_operator(self, user_metadata, tool_parameters)
+         return new_stmt
+
+
+ class AndStatement(BaseStatement):
+     and_list: List[Union["Statement", "AndStatement", "OrStatement"]] = Field(
+         validation_alias=AliasChoices("and", "and_list"), serialization_alias="and"
+     )
+
+     def _fill_in_variables(
+         self,
+         user_metadata: Mapping[str, Union[str, int, bool]],
+         tool_parameters: Mapping[str, Union[str, int, bool]],
+     ) -> Self:
+         new_stmt = self.model_copy()
+         new_stmt.and_list = [
+             sub_query._fill_in_variables(user_metadata, tool_parameters)
+             for sub_query in self.and_list
+         ]
+         return new_stmt
+
+
+ class OrStatement(BaseStatement):
+     or_list: List[Union["Statement", "AndStatement", "OrStatement"]] = Field(
+         validation_alias=AliasChoices("or", "or_list"), serialization_alias="or"
+     )
+
+     def _fill_in_variables(
+         self,
+         user_metadata: Mapping[str, Union[str, int, bool]],
+         tool_parameters: Mapping[str, Union[str, int, bool]],
+     ) -> Self:
+         new_stmt = self.model_copy()
+         new_stmt.or_list = [
+             sub_query._fill_in_variables(user_metadata, tool_parameters)
+             for sub_query in self.or_list
+         ]
+         return new_stmt
+
+
+ # Update the forward references
+ Statement.model_rebuild()
+ AndStatement.model_rebuild()
+ OrStatement.model_rebuild()
+
+
+ UniqueQL = Union[Statement, AndStatement, OrStatement]
+
+
+ def is_array_of_strings(value: Any) -> bool:
+     return isinstance(value, list) and all(isinstance(item, str) for item in value)
+
+
+ def eval_operator(
+     query: Statement,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Any:
+     if query.operator in [
+         Operator.EQUALS,
+         Operator.NOT_EQUALS,
+         Operator.GREATER_THAN,
+         Operator.GREATER_THAN_OR_EQUAL,
+         Operator.LESS_THAN,
+         Operator.LESS_THAN_OR_EQUAL,
+         Operator.CONTAINS,
+         Operator.NOT_CONTAINS,
+     ]:
+         return binary_operator(query.value, user_metadata, tool_parameters)
+     elif query.operator in [Operator.IS_NULL, Operator.IS_NOT_NULL]:
+         return null_operator(query.value, user_metadata, tool_parameters)
+     elif query.operator in [Operator.IS_EMPTY, Operator.IS_NOT_EMPTY]:
+         return empty_operator(query.operator, user_metadata, tool_parameters)
+     elif query.operator == Operator.NESTED:
+         return eval_nested_operator(query.value, user_metadata, tool_parameters)
+     elif query.operator in [Operator.IN, Operator.NOT_IN]:
+         return array_operator(query.value, user_metadata, tool_parameters)
+     else:
+         raise ValueError(f"Operator {query.operator} not supported")
+
+
+ def eval_nested_operator(
+     value: Any,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Union[AndStatement, OrStatement]:
+     if not isinstance(value, (AndStatement, OrStatement)):
+         raise ValueError("Nested operator must be an AndStatement or OrStatement")
+     return value._fill_in_variables(user_metadata, tool_parameters)
+
+
+ def binary_operator(
+     value: Any,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Any:
+     return replace_variables(value, user_metadata, tool_parameters)
+
+
+ def array_operator(
+     value: Any,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Any:
+     if is_array_of_strings(value):
+         return [
+             replace_variables(item, user_metadata, tool_parameters) for item in value
+         ]
+     return value
+
+
+ def null_operator(
+     value: Any,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Any:
+     return value  # do nothing for now. No variables to replace
+
+
+ def empty_operator(
+     operator: Operator,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Any:
+     """Handle IS_EMPTY and IS_NOT_EMPTY operators."""
+     if operator == Operator.IS_EMPTY:
+         return ""
+     elif operator == Operator.IS_NOT_EMPTY:
+         return "not_empty"
+     return None
+
+
+ def calculate_current_date() -> str:
+     """Calculate current date in UTC with seconds precision."""
+     return datetime.now(timezone.utc).isoformat(timespec="seconds")
+
+
+ def calculate_earlier_date(input_str: str) -> str:
+     match = re.search(r"<T-(\d+)>", input_str)
+     if not match:
+         return calculate_current_date()  # Return current date if no match
+     days = int(match.group(1))
+     return (datetime.now(timezone.utc) - timedelta(days=days)).isoformat(
+         timespec="seconds"
+     )
+
+
+ def calculate_later_date(input_str: str) -> str:
+     match = re.search(r"<T\+(\d+)>", input_str)  # Note: escaped + in regex
+     if not match:
+         return calculate_current_date()  # Return current date if no match
+     days = int(match.group(1))
+     return (datetime.now(timezone.utc) + timedelta(days=days)).isoformat(
+         timespec="seconds"
+     )
+
+
+ def replace_variables(
+     value: Any,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Any:
+     if isinstance(value, str):
+         if "||" in value:
+             return get_fallback_values(value, user_metadata, tool_parameters)
+         elif value == "<T>":
+             return calculate_current_date()
+         elif "<T-" in value:
+             return calculate_earlier_date(value)
+         elif "<T+" in value:
+             return calculate_later_date(value)
+
+         value = replace_tool_parameters_patterns(value, tool_parameters)
+         value = replace_user_metadata_patterns(value, user_metadata)
+
+         if value == "":
+             return value
+         try:
+             return int(value)
+         except ValueError:
+             if value.lower() in ["true", "false"]:
+                 return value.lower() == "true"
+             return value
+     return value
+
+
+ def replace_tool_parameters_patterns(
+     value: str, tool_parameters: Dict[str, Union[str, int, bool]]
+ ) -> str:
+     def replace_match(match):
+         param_name = match.group(1)
+         return str(tool_parameters.get(param_name, ""))
+
+     return re.sub(r"<toolParameters\.(\w+)>", replace_match, value)
+
+
+ def replace_user_metadata_patterns(
+     value: str, user_metadata: Dict[str, Union[str, int, bool]]
+ ) -> str:
+     def replace_match(match):
+         param_name = match.group(1)
+         return str(user_metadata.get(param_name, ""))
+
+     return re.sub(r"<userMetadata\.(\w+)>", replace_match, value)
+
+
+ def get_fallback_values(
+     value: str,
+     user_metadata: Mapping[str, Union[str, int, bool]],
+     tool_parameters: Mapping[str, Union[str, int, bool]],
+ ) -> Any:
+     values = value.split("||")
+     for val in values:
+         data = replace_variables(val, user_metadata, tool_parameters)
+         if data != "":
+             return data
+     return values
+
+
+ # Example usage:
+ def parse_uniqueql(json_data: Dict[str, Any]) -> UniqueQL:
+     if "operator" in json_data:
+         return Statement.model_validate(json_data)
+     elif "or" in json_data:
+         return OrStatement.model_validate(
+             {"or": [parse_uniqueql(item) for item in json_data["or"]]}
+         )
+     elif "and" in json_data:
+         return AndStatement.model_validate(
+             {"and": [parse_uniqueql(item) for item in json_data["and"]]}
+         )
+     else:
+         raise ValueError("Invalid UniqueQL format")
@@ -190,9 +190,11 @@ def count_tokens(text: str, encoding_model="cl100k_base") -> int:
      return len(encoding.encode(text))
 
 
- def map_content_chunk(content_chunk: dict):
+ def map_content_chunk(content_id: str, content_key: str, content_chunk: dict):
      return ContentChunk(
-         id=content_chunk["id"],
+         id=content_id,
+         key=content_key,
+         chunk_id=content_chunk["id"],
          text=content_chunk["text"],
          start_page=content_chunk["startPage"],
          end_page=content_chunk["endPage"],
@@ -206,9 +208,14 @@ def map_content(content: dict):
          key=content["key"],
          title=content["title"],
          url=content["url"],
-         chunks=[map_content_chunk(chunk) for chunk in content["chunks"]],
+         chunks=[
+             map_content_chunk(content["id"], content["key"], chunk)
+             for chunk in content["chunks"]
+         ],
          created_at=content["createdAt"],
          updated_at=content["updatedAt"],
+         ingestion_state=content.get("ingestionState"),
+         expired_at=content.get("expiredAt"),
      )
 
 
@@ -0,0 +1,96 @@
+ # Data Extraction Module
+
+ This module provides a flexible framework for extracting structured data from text using language models. It supports both basic and augmented data extraction capabilities.
+
+ ## Overview
+
+ The module consists of two main components:
+
+ 1. **Basic Data Extraction**: Uses language models to extract structured data from text based on a provided schema.
+ 2. **Augmented Data Extraction**: Extends basic extraction by adding extra fields to the output schema while maintaining the original data structure.
+
+ ## Components
+
+ ### Base Classes
+
+ - `BaseDataExtractor`: Abstract base class that defines the interface for data extraction
+ - `BaseDataExtractionResult`: Generic base class for extraction results
+
+ ### Basic Extraction
+
+ - `StructuredOutputDataExtractor`: Implements basic data extraction using language models
+ - `StructuredOutputDataExtractorConfig`: Configuration for the basic extractor
+
+ ### Augmented Extraction
+
+ - `AugmentedDataExtractor`: Extends basic extraction with additional fields
+ - `AugmentedDataExtractionResult`: Result type for augmented extraction
+
+ ## Usage Examples
+
+ ### Basic Data Extraction
+
+ ```python
+ from pydantic import BaseModel
+ from unique_toolkit._common.data_extraction import StructuredOutputDataExtractor, StructuredOutputDataExtractorConfig
+ from unique_toolkit import LanguageModelService
+
+ # Define your schema
+ class PersonInfo(BaseModel):
+     name: str
+     age: int
+     occupation: str
+
+ # Create the extractor
+ config = StructuredOutputDataExtractorConfig()
+ lm_service = LanguageModelService()  # Configure as needed
+ extractor = StructuredOutputDataExtractor(config, lm_service)
+
+ # Extract data
+ text = "John is 30 years old and works as a software engineer."
+ result = await extractor.extract_data_from_text(text, PersonInfo)
+ print(result.data)  # PersonInfo(name="John", age=30, occupation="software engineer")
+ ```
+
+ ### Augmented Data Extraction
+
+ ```python
+ from pydantic import BaseModel, Field
+ from _common.data_extraction import AugmentedDataExtractor, StructuredOutputDataExtractor
+
+ # Define your base schema
+ class PersonInfo(BaseModel):
+     name: str
+     age: int
+
+ # Create base extractor
+ base_extractor = StructuredOutputDataExtractor(...)
+
+ # Create augmented extractor with confidence scores
+ augmented_extractor = AugmentedDataExtractor(
+     base_extractor,
+     confidence=float,
+     source=("extracted", Field(description="Source of the information"))
+ )
+
+ # Extract data
+ text = "John is 30 years old."
+ result = await augmented_extractor.extract_data_from_text(text, PersonInfo)
+ print(result.data)  # Original PersonInfo
+ print(result.augmented_data)  # Contains additional fields
+ ```
+
+ ## Configuration
+
+ The `StructuredOutputDataExtractorConfig` allows customization of:
+
+ - Language model selection
+ - System and user prompt templates
+ - Schema enforcement settings
+
+ ## Best Practices
+
+ 1. Always define clear Pydantic models for your extraction schemas
+ 2. Use augmented extraction when you need additional metadata
+ 3. Consider using strict mode for augmented extraction when you want to enforce schema compliance
+ 4. Customize prompts for better extraction results in specific domains
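The README's Configuration section lists the tunable settings without showing them in use. The sketch below is editorial, not part of the package diff: it uses only the fields visible in the `unique_toolkit/data_extraction/basic/config.py` hunk further down, the prompt strings are invented, and the import path follows the `data_extraction/__init__.py` hunk below rather than the `_common` path in the README examples.

```python
# Hypothetical customization based on the config fields shown later in this diff.
from unique_toolkit.data_extraction import (
    StructuredOutputDataExtractor,
    StructuredOutputDataExtractorConfig,
)
from unique_toolkit.language_model import LanguageModelService

config = StructuredOutputDataExtractorConfig(
    # language_model keeps its default (DEFAULT_GPT_4o via get_LMI_default_field)
    structured_output_enforce_schema=True,
    system_prompt_template="You extract contract metadata precisely.",
    user_prompt_template="Extract the requested fields from:\n{{ text }}",
)
lm_service = LanguageModelService()  # configure as needed, as in the README
extractor = StructuredOutputDataExtractor(config, lm_service)
```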
@@ -0,0 +1,11 @@
+ from unique_toolkit.data_extraction.augmented import AugmentedDataExtractor
+ from unique_toolkit.data_extraction.basic import (
+     StructuredOutputDataExtractor,
+     StructuredOutputDataExtractorConfig,
+ )
+
+ __all__ = [
+     "StructuredOutputDataExtractor",
+     "StructuredOutputDataExtractorConfig",
+     "AugmentedDataExtractor",
+ ]
@@ -0,0 +1,5 @@
+ from unique_toolkit.data_extraction.augmented.service import (
+     AugmentedDataExtractor,
+ )
+
+ __all__ = ["AugmentedDataExtractor"]
@@ -0,0 +1,93 @@
+ from docxtpl.template import Any
+ from pydantic import BaseModel, create_model
+ from pydantic.alias_generators import to_pascal
+ from pydantic.fields import FieldInfo
+ from typing_extensions import override
+
+ from unique_toolkit.data_extraction.base import (
+     BaseDataExtractionResult,
+     BaseDataExtractor,
+     ExtractionSchema,
+ )
+
+
+ def _build_augmented_model_for_field(
+     field_name: str,
+     field_type: Any | tuple[Any, FieldInfo],
+     strict: bool = False,
+     **extra_fields: Any | tuple[Any, FieldInfo],
+ ) -> type[BaseModel]:
+     camelized_field_name = to_pascal(field_name)
+
+     fields = {
+         **extra_fields,
+         field_name: field_type,
+     }
+
+     return create_model(
+         f"{camelized_field_name}Value",
+         **fields,  # type: ignore
+         __config__={"extra": "forbid" if strict else "ignore"},
+     )
+
+
+ class AugmentedDataExtractionResult(BaseDataExtractionResult[ExtractionSchema]):
+     """
+     Result of data extraction from text using an augmented schema.
+     """
+
+     augmented_data: BaseModel
+
+
+ class AugmentedDataExtractor(BaseDataExtractor):
+     def __init__(
+         self,
+         base_data_extractor: BaseDataExtractor,
+         strict: bool = False,
+         **extra_fields: Any | tuple[Any, FieldInfo],
+     ):
+         self._base_data_extractor = base_data_extractor
+         self._extra_fields = extra_fields
+         self._strict = strict
+
+     def _prepare_schema(self, schema: type[ExtractionSchema]) -> type[BaseModel]:
+         fields = {}
+
+         for field_name, field_type in schema.model_fields.items():
+             wrapped_field = _build_augmented_model_for_field(
+                 field_name,
+                 (field_type.annotation, field_type),
+                 strict=self._strict,
+                 **self._extra_fields,
+             )
+             fields[field_name] = wrapped_field
+
+         return create_model(
+             schema.__name__,
+             **fields,
+             __config__={"extra": "forbid" if self._strict else "ignore"},
+             __doc__=schema.__doc__,
+         )
+
+     def _extract_output(
+         self, llm_output: BaseModel, schema: type[ExtractionSchema]
+     ) -> ExtractionSchema:
+         output_data = {
+             field_name: getattr(value, field_name) for field_name, value in llm_output
+         }
+         return schema.model_validate(output_data)
+
+     @override
+     async def extract_data_from_text(
+         self, text: str, schema: type[ExtractionSchema]
+     ) -> AugmentedDataExtractionResult[ExtractionSchema]:
+         model_with_extra_fields = self._prepare_schema(schema)
+         augmented_data = (
+             await self._base_data_extractor.extract_data_from_text(
+                 text, model_with_extra_fields
+             )
+         ).data
+         return AugmentedDataExtractionResult(
+             data=self._extract_output(augmented_data, schema),
+             augmented_data=augmented_data,
+         )
@@ -0,0 +1,25 @@
+ from abc import ABC, abstractmethod
+ from typing import Generic, TypeVar
+
+ from pydantic import BaseModel
+
+ ExtractionSchema = TypeVar("ExtractionSchema", bound=BaseModel)
+
+
+ class BaseDataExtractionResult(BaseModel, Generic[ExtractionSchema]):
+     """
+     Base class for data extraction results.
+     """
+
+     data: ExtractionSchema
+
+
+ class BaseDataExtractor(ABC):
+     """
+     Extract structured data from text.
+     """
+
+     @abstractmethod
+     async def extract_data_from_text(
+         self, text: str, schema: type[ExtractionSchema]
+     ) -> BaseDataExtractionResult[ExtractionSchema]: ...
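`BaseDataExtractor` is the extension point the rest of the module builds on, but the diff only shows LLM-backed implementations. Purely as an illustration (the class name and regex below are invented, not part of the package), a custom extractor would plug into the same interface like this:

```python
# Toy implementation of the abstract interface above; invented for illustration.
import re

from unique_toolkit.data_extraction.base import (
    BaseDataExtractionResult,
    BaseDataExtractor,
    ExtractionSchema,
)


class RegexDataExtractor(BaseDataExtractor):
    """Extracts '<name> is <age>' pairs without calling a language model.

    Assumes the target schema has exactly the fields `name: str` and `age: int`.
    """

    async def extract_data_from_text(
        self, text: str, schema: type[ExtractionSchema]
    ) -> BaseDataExtractionResult[ExtractionSchema]:
        match = re.search(r"(\w+) is (\d+)", text)
        if match is None:
            raise ValueError("No extractable data found")
        data = schema.model_validate(
            {"name": match.group(1), "age": int(match.group(2))}
        )
        return BaseDataExtractionResult(data=data)
```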
@@ -0,0 +1,11 @@
+ from unique_toolkit.data_extraction.basic.config import (
+     StructuredOutputDataExtractorConfig,
+ )
+ from unique_toolkit.data_extraction.basic.service import (
+     StructuredOutputDataExtractor,
+ )
+
+ __all__ = [
+     "StructuredOutputDataExtractorConfig",
+     "StructuredOutputDataExtractor",
+ ]
@@ -0,0 +1,18 @@
+ from pydantic import BaseModel
+
+ from unique_toolkit._common.pydantic_helpers import get_configuration_dict
+ from unique_toolkit._common.validators import LMI, get_LMI_default_field
+ from unique_toolkit.data_extraction.basic.prompt import (
+     DEFAULT_DATA_EXTRACTION_SYSTEM_PROMPT,
+     DEFAULT_DATA_EXTRACTION_USER_PROMPT,
+ )
+ from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+
+
+ class StructuredOutputDataExtractorConfig(BaseModel):
+     model_config = get_configuration_dict()
+
+     language_model: LMI = get_LMI_default_field(DEFAULT_GPT_4o)
+     structured_output_enforce_schema: bool = False
+     system_prompt_template: str = DEFAULT_DATA_EXTRACTION_SYSTEM_PROMPT
+     user_prompt_template: str = DEFAULT_DATA_EXTRACTION_USER_PROMPT
@@ -0,0 +1,13 @@
+ DEFAULT_DATA_EXTRACTION_SYSTEM_PROMPT = """
+ You are a thorough and accurate expert in data processing.
+
+ You will be given some text and an output schema, describing what needs to be extracted from the text.
+ You will need to extract the data from the text and return it in the output schema.
+ """.strip()
+
+ DEFAULT_DATA_EXTRACTION_USER_PROMPT = """
+ Here is the text to extract data from:
+ {{ text }}
+
+ Please thoroughly extract the data from the text and return it in the output schema.
+ """.strip()
@@ -0,0 +1,55 @@
+ from typing_extensions import override
+
+ from unique_toolkit._common.utils.jinja.render import render_template
+ from unique_toolkit.data_extraction.base import (
+     BaseDataExtractionResult,
+     BaseDataExtractor,
+     ExtractionSchema,
+ )
+ from unique_toolkit.data_extraction.basic.config import (
+     StructuredOutputDataExtractorConfig,
+ )
+ from unique_toolkit.language_model import LanguageModelService
+ from unique_toolkit.language_model.builder import MessagesBuilder
+
+
+ class StructuredOutputDataExtractor(BaseDataExtractor):
+     """
+     Basic Structured Output Data Extraction.
+     """
+
+     def __init__(
+         self,
+         config: StructuredOutputDataExtractorConfig,
+         language_model_service: LanguageModelService,
+     ):
+         self._config = config
+         self._language_model_service = language_model_service
+
+     @override
+     async def extract_data_from_text(
+         self, text: str, schema: type[ExtractionSchema]
+     ) -> BaseDataExtractionResult[ExtractionSchema]:
+         messages_builder = (
+             MessagesBuilder()
+             .system_message_append(self._config.system_prompt_template)
+             .user_message_append(
+                 render_template(
+                     self._config.user_prompt_template,
+                     {
+                         "text": text,
+                     },
+                 )
+             )
+         )
+         response = await self._language_model_service.complete_async(
+             messages=messages_builder.build(),
+             model_name=self._config.language_model.name,
+             structured_output_model=schema,
+             temperature=0.0,
+             structured_output_enforce_schema=self._config.structured_output_enforce_schema,
+         )
+
+         return BaseDataExtractionResult(
+             data=schema.model_validate(response.choices[0].message.parsed),
+         )