unique_toolkit 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,282 @@
+ import functools
+ import logging
+ from typing import (
+     Awaitable,
+     Callable,
+     Generic,
+     Iterable,
+     ParamSpec,
+     Type,
+     TypeVar,
+     cast,
+ )
+
+ # Function types
+ P = ParamSpec("P")
+ R = TypeVar("R")
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Result(Generic[R]):
+     def __init__(
+         self,
+         success: bool,
+         result: R | None = None,
+         exception: Exception | None = None,
+     ) -> None:
+         self._success = success
+         self._result = result
+         self._exception = exception
+
+     @property
+     def exception(self) -> Exception | None:
+         return self._exception
+
+     @property
+     def success(self) -> bool:
+         return self._success
+
+     def unpack(self, default: R | None = None) -> R:
+         return cast(R, self._result) if self.success else cast(R, default)
+
+     def __str__(self) -> str:
+         return (
+             f"Success: {str(self._result)}"
+             if self.success
+             else f"Failure: {str(self._exception)}"
+         )
+
+
+ class SafeTaskExecutor:
+     """
+     Execute function calls "safely": exceptions are caught and logged,
+     and the function result is returned as a `Result` object.
+
+     Several parameters are available to customize the behavior of the executor:
+     - `exceptions`: the exception types that are caught and logged
+     - `ignored_exceptions`: exception types that are re-raised (passed through) instead of caught
+     - `log_exceptions`: whether to log caught exceptions
+     - `log_exc_info`: whether to include the traceback (`exc_info`) when logging
+     - `logger`: the logger to use; defaults to this module's logger
+
+
+     Usage:
+     ```python
+     executor = SafeTaskExecutor(
+         exceptions=(ValueError,),
+         ignored_exceptions=(KeyError,),
+     )
+
+     executor.execute(failing_function, "test")
+
+     await executor.execute_async(async_failing_function, "test")
+     ```
+     """
+
+     def __init__(
+         self,
+         exceptions: Iterable[Type[Exception]] = (Exception,),
+         ignored_exceptions: Iterable[Type[Exception]] = (),
+         log_exceptions: bool = True,
+         log_exc_info: bool = True,
+         logger: logging.Logger | None = None,
+     ) -> None:
+         self._exceptions = tuple(exceptions)
+         self._ignored_exceptions = tuple(ignored_exceptions)
+         self._log_exceptions = log_exceptions
+         self._log_exc_info = log_exc_info
+         self._logger = logger
+
+     def execute(
+         self, f: Callable[P, R], *args: P.args, **kwargs: P.kwargs
+     ) -> Result[R]:
+         try:
+             return Result(True, f(*args, **kwargs))
+         except self._exceptions as e:
+             if isinstance(e, self._ignored_exceptions):
+                 raise e
+             if self._log_exceptions:
+                 (self._logger or logger).error(f"Error in {f.__name__}: {e}", exc_info=self._log_exc_info)
+             return Result(False, exception=e)
+
+     async def execute_async(
+         self, f: Callable[P, Awaitable[R]], *args: P.args, **kwargs: P.kwargs
+     ) -> Result[R]:
+         try:
+             return Result(True, await f(*args, **kwargs))
+         except self._exceptions as e:
+             if isinstance(e, self._ignored_exceptions):
+                 raise e
+             if self._log_exceptions:
+                 (self._logger or logger).error(f"Error in {f.__name__}: {e}", exc_info=self._log_exc_info)
+             return Result(False, exception=e)
+
+
+ def safe_execute(f: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> Result[R]:
+     """
+     Execute a function call "safely": exceptions are caught and logged,
+     and the function result is returned as a `Result` object.
+
+     Usage:
+     ```python
+     def failing_function(a: str) -> int:
+         raise ValueError(a)
+
+     result = safe_execute(failing_function, "test")
+     print(result)
+     >> Failure: test
+
+     result.success
+     >> False
+
+     result.unpack()
+     >> None
+
+     result.exception
+     >> ValueError('test')
+
+     result.unpack(default=1)
+     >> 1
+     ```
+
+     ```python
+     def succeeding_function(a: str):
+         return a
+
+
+     result = safe_execute(succeeding_function, "test")
+
+     print(result)
+     >> Success: test
+
+     result.success
+     >> True
+
+     result.unpack()
+     >> 'test'
+
+     result.exception
+     >> None
+     ```
+     """
+     return SafeTaskExecutor().execute(f, *args, **kwargs)
+
+
+ async def safe_execute_async(
+     f: Callable[P, Awaitable[R]], *args: P.args, **kwargs: P.kwargs
+ ) -> Result[R]:
+     """
+     Equivalent to `safe_execute` for async functions.
+     """
+     return await SafeTaskExecutor().execute_async(f, *args, **kwargs)
+
+
+ FailureReturnType = TypeVar("FailureReturnType")
+
+
+ def failsafe(
+     failure_return_value: FailureReturnType,
+     exceptions: Iterable[Type[Exception]] = (Exception,),
+     ignored_exceptions: Iterable[Type[Exception]] = (),
+     log_exceptions: bool = True,
+     log_exc_info: bool = True,
+     logger: logging.Logger | None = None,
+ ) -> Callable[[Callable[P, R]], Callable[P, R | FailureReturnType]]:
+     """
+     Decorator that executes sync functions with failsafe behavior: exceptions are caught and logged,
+     and a fallback return value is returned on failure instead of raising the exception.
+
+     Parameters are the same as SafeTaskExecutor plus:
+     - `failure_return_value`: value to return when an exception occurs
+
+     Usage:
+     ```python
+     @failsafe(
+         failure_return_value="default",
+         exceptions=(ValueError,),
+         ignored_exceptions=(KeyError,),
+     )
+     def failing_function(a: str) -> str:
+         raise ValueError(a)
+
+
+     result = failing_function("test")
+     # Returns "default" instead of raising ValueError
+     ```
+     """
+
+     def decorator(func: Callable[P, R]) -> Callable[P, R | FailureReturnType]:
+         executor = SafeTaskExecutor(
+             exceptions=exceptions,
+             ignored_exceptions=ignored_exceptions,
+             log_exceptions=log_exceptions,
+             log_exc_info=log_exc_info,
+             logger=logger,
+         )
+
+         @functools.wraps(func)
+         def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R | FailureReturnType:
+             result = executor.execute(func, *args, **kwargs)
+             return result.unpack(default=cast(R, failure_return_value))
+
+         return sync_wrapper
+
+     return decorator
+
+
+ def failsafe_async(
+     failure_return_value: FailureReturnType,
+     exceptions: Iterable[Type[Exception]] = (Exception,),
+     ignored_exceptions: Iterable[Type[Exception]] = (),
+     log_exceptions: bool = True,
+     log_exc_info: bool = True,
+     logger: logging.Logger | None = None,
+ ) -> Callable[
+     [Callable[P, Awaitable[R]]], Callable[P, Awaitable[R | FailureReturnType]]
+ ]:
+     """
+     Decorator that executes async functions with failsafe behavior: exceptions are caught and logged,
+     and a fallback return value is returned on failure instead of raising the exception.
+
+     Parameters are the same as SafeTaskExecutor plus:
+     - `failure_return_value`: value to return when an exception occurs
+
+     Usage:
+     ```python
+     @failsafe_async(
+         failure_return_value=[],
+         exceptions=(ValueError,),
+         ignored_exceptions=(KeyError,),
+     )
+     async def async_failing_function(a: str) -> list:
+         raise ValueError(a)
+
+
+     result = await async_failing_function("test")
+     # Returns [] instead of raising ValueError
+     ```
+     """
+
+     def decorator(
+         func: Callable[P, Awaitable[R]],
+     ) -> Callable[P, Awaitable[R | FailureReturnType]]:
+         executor = SafeTaskExecutor(
+             exceptions=exceptions,
+             ignored_exceptions=ignored_exceptions,
+             log_exceptions=log_exceptions,
+             log_exc_info=log_exc_info,
+             logger=logger,
+         )
+
+         @functools.wraps(func)
+         async def async_wrapper(
+             *args: P.args, **kwargs: P.kwargs
+         ) -> R | FailureReturnType:
+             result = await executor.execute_async(func, *args, **kwargs)
+             return result.unpack(default=cast(R, failure_return_value))
+
+         return async_wrapper
+
+     return decorator
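For orientation, here is a minimal usage sketch of the helpers added above. The import path is a guess (the diff does not show where this module lives inside `unique_toolkit`), and `parse_ratio` is an invented example function.

```python
# Hypothetical import path - adjust to the module's actual location in unique_toolkit.
from unique_toolkit.tools.utils.execution import failsafe, safe_execute


def parse_ratio(raw: str) -> float:
    return float(raw)


# safe_execute returns a Result instead of raising
result = safe_execute(parse_ratio, "not-a-number")
print(result.success)               # False
print(result.unpack(default=0.0))   # 0.0


# failsafe turns the same behavior into a decorator with a fallback value
@failsafe(failure_return_value=0.0, exceptions=(ValueError,))
def parse_ratio_or_zero(raw: str) -> float:
    return float(raw)


print(parse_ratio_or_zero("0.75"))           # 0.75
print(parse_ratio_or_zero("not-a-number"))   # 0.0, the ValueError is logged instead of raised
```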
@@ -0,0 +1,22 @@
+ # default schema follows logic in node-ingestion-worker: https://github.com/Unique-AG/monorepo/blob/76b4923611199a80abf9304639b3aa0538ec41ed/node/apps/node-ingestion-worker/src/ingestors/lib/text-manipulations.ts#L181C17-L181C28
+ from pydantic import BaseModel
+
+ from unique_toolkit.tools.config import get_configuration_dict
+
+
+ SOURCE_TEMPLATE = "<source${index}>${document}${info}${text}</source${index}>"
+ SECTIONS = {
+     "document": "<|document|>{}<|/document|>\n",
+     "info": "<|info|>{}<|/info|>\n",
+ }
+
+
+ class SourceFormatConfig(BaseModel):
+     model_config = get_configuration_dict()
+     source_template: str = SOURCE_TEMPLATE
+     sections: dict[str, str] = SECTIONS
+
+     @staticmethod
+     def template_to_pattern(template: str) -> str:
+         """Convert a template string into a regex pattern."""
+         return template.replace("{}", "(.*?)").replace("|", r"\|")
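The `template_to_pattern` helper above is the bridge between the section templates and the regex extraction used in the next module. Below is a standalone sketch of the same transformation, runnable with plain `re`; the sample text is invented.

```python
import re


def template_to_pattern(template: str) -> str:
    # Same transformation as SourceFormatConfig.template_to_pattern:
    # "{}" becomes a non-greedy capture group, "|" is escaped so it matches literally.
    return template.replace("{}", "(.*?)").replace("|", r"\|")


DOCUMENT_TEMPLATE = "<|document|>{}<|/document|>\n"
pattern = template_to_pattern(DOCUMENT_TEMPLATE)
# pattern is now: <\|document\|>(.*?)<\|/document\|>  followed by a newline

text = "<|document|>Annual Report.pdf<|/document|>\nKey findings ..."
match = re.search(pattern, text, re.DOTALL)
print(match.group(1) if match else None)  # Annual Report.pdf
```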
@@ -0,0 +1,207 @@
+ import re
+ from string import Template
+
+ from unique_toolkit.content.schemas import ContentChunk
+ from unique_toolkit.tools.utils.source_handling.schema import SourceFormatConfig
+
+
+ def _format_page_range(chunk: ContentChunk) -> str:
+     """Format page range string from chunk metadata."""
+     if not (
+         chunk.start_page
+         and chunk.end_page
+         and chunk.start_page > 0
+         and chunk.end_page > 0
+     ):
+         return ""
+     return (
+         str(chunk.start_page)
+         if chunk.start_page == chunk.end_page
+         else f"{chunk.start_page} - {chunk.end_page}"
+     )
+
+
+ def _parse_chunk(
+     chunk: ContentChunk, section_templates: dict[str, str]
+ ) -> dict[str, str]:
+     """Extract sections from chunk text using regex patterns."""
+     text = chunk.text
+     result = dict()
+
+     for section, template in section_templates.items():
+         # Document and info are the only sections that are included in the text
+         if section in [
+             "document",
+             "info",
+         ]:  # Skip page as it's derived from metadata
+             pattern = SourceFormatConfig.template_to_pattern(template)
+             match = re.search(pattern, text, re.DOTALL)
+             result[section] = match.group(1) if match else ""
+             text = text.replace(match.group(0), "") if match else text
+
+     result["text"] = text.strip()
+     return result
+
+
+ def format_chunk(index: int, chunk: ContentChunk, config: SourceFormatConfig) -> str:
+     """
+     Format a content chunk according to a configuration template and its sections. Each chunk in the database includes a document section, an optional info section, and a text section, the text section being the primary content. Typically, chunks are added to sources in search modules unchanged; however, some scenarios require extra formatting, such as adding page numbers or other metadata. This function enables custom formatting of chunks when they are appended as sources.
+
+     Args:
+         index (int): The source index number to be used in the template.
+         chunk (ContentChunk): A ContentChunk object containing:
+             - text (str): The main content text
+             - start_page (int, optional): Starting page number
+             - end_page (int, optional): Ending page number
+             - metadata (dict, optional): Additional metadata key-value pairs
+         config (SourceFormatConfig): Configuration object containing:
+             - source_template (str): The overall template for the output
+             - sections (dict): Mapping of section names to their format templates
+
+     Returns:
+         str: Formatted string according to the template
+
+     Examples:
+         Using XML-style config without page numbers (default):
+         >>> config = SourceFormatConfig(
+         ...     source_template="<source${index}>${document}${info}${text}</source${index}>",
+         ...     sections={
+         ...         "document": "<|document|>{}<|/document|>\n",
+         ...         "info": "<|info|>{}<|/info|>\n",
+         ...     },
+         ... )
+         >>> chunk = ContentChunk(
+         ...     text="<|document|>Sample Doc.pdf<|/document|>\n<|info|>Important info<|/info|>\nMain content"
+         ... )
+         >>> format_chunk(1, chunk, config)
+         '<source1><|document|>Sample Doc.pdf<|/document|>\n<|info|>Important info<|/info|>\nMain content</source1>'
+
+         Using XML-style config with page numbers:
+         >>> config = SourceFormatConfig(
+         ...     source_template="<source${index}>${document}${page}${info}${text}</source${index}>",
+         ...     sections={
+         ...         "document": "<|document|>{}<|/document|>\n",
+         ...         "info": "<|info|>{}<|/info|>\n",
+         ...         "page": "<|page|>{}<|/page|>\n",
+         ...     },
+         ... )
+         >>> chunk = ContentChunk(
+         ...     text="<|document|>Sample Doc.pdf<|/document|>\n<|info|>Important info<|/info|>\nMain content",
+         ...     start_page=1,
+         ...     end_page=3,
+         ... )
+         >>> format_chunk(1, chunk, config)
+         '<source1><|document|>Sample Doc.pdf<|/document|>\n<|page|>1 - 3<|/page|>\n<|info|>Important info<|/info|>\nMain content</source1>'
+
+         Using XML-style config with metadata:
+         >>> config = SourceFormatConfig(
+         ...     source_template="<source${index}>${document}${date}${text}</source${index}>",
+         ...     sections={
+         ...         "document": "<|document|>{}<|/document|>\n",
+         ...         "date": "<|DateFromMetaData|>{}<|/DateFromMetaData|>\n",
+         ...     },
+         ... )
+         >>> chunk = ContentChunk(
+         ...     text="<|document|>Sample Doc.pdf<|/document|>\nMain content",
+         ...     metadata={
+         ...         "key": "metadata-key",
+         ...         "mimeType": "text/plain",
+         ...         "date": "12.03.2025",
+         ...     },
+         ... )
+         >>> format_chunk(1, chunk, config)
+         '<source1><|document|>Sample Doc.pdf<|/document|>\n<|DateFromMetaData|>12.03.2025<|/DateFromMetaData|>\nMain content</source1>'
+
+         Using JSON-style config:
+         >>> config = SourceFormatConfig(
+         ...     source_template="{'source_number': ${index}, 'content': '${document}${page}${info}${text}'}",
+         ...     sections={
+         ...         "document": "<|document|>{}<|/document|>\n",
+         ...         "info": "<|info|>{}<|/info|>\n",
+         ...         "page": "<|page|>{}<|/page|>\n",
+         ...     },
+         ... )
+         >>> chunk = ContentChunk(
+         ...     text="<|document|>Sample Doc.pdf<|/document|>\n<|info|>Important info<|/info|>\nMain content",
+         ...     start_page=5,
+         ...     end_page=5,
+         ... )
+         >>> format_chunk(1, chunk, config)
+         "{'source_number': 1, 'content': '<|document|>Sample Doc.pdf<|/document|>\n<|page|>5<|/page|>\n<|info|>Important info<|/info|>\nMain content'}"
+
+     Notes:
+         - The function extracts document and info sections from the chunk text using regex patterns
+         - Page numbers are formatted as single numbers when start_page equals end_page
+         - Page numbers are formatted as ranges (e.g., "1 - 3") when start_page differs from end_page
+         - If page numbers are not available (None or 0), the page section will be empty
+         - Metadata keys that match section names (except 'document' and 'info') will be included in the output
+         - Metadata is processed by the _process_metadata function to update the parsed dictionary
+         - When using custom metadata tags like '<|DateFromMetaData|>', the key in chunk.metadata must match
+           the key in the sections dictionary (e.g., 'date' in the example above), not the tag name
+     """
+     sections = config.sections
+     source_template = config.source_template
+
+     parsed = _parse_chunk(chunk, sections)
+     parsed["page"] = _format_page_range(chunk)
+
+     # Update parsed with metadata values
+     parsed = _process_metadata(chunk, parsed, sections)
+
+     # Create a new dictionary to hold the formatted sections
+     formatted_sections = {}
+
+     # Process each section
+     for section, template in sections.items():
+         if parsed.get(section):
+             formatted_sections[section] = template.format(parsed.get(section, ""))
+         else:
+             formatted_sections[section] = ""
+
+     # Add the text section
+     formatted_sections["text"] = parsed["text"]
+
+     return Template(source_template).substitute(index=index, **formatted_sections)
+
+
+ def _process_metadata(
+     chunk: ContentChunk, parsed: dict[str, str], sections: dict[str, str]
+ ) -> dict[str, str]:
+     """
+     Process metadata from chunk and update the parsed dictionary.
+
+     This function extracts metadata from a ContentChunk object and updates the parsed
+     dictionary with values whose keys match section names defined in SourceFormatConfig.
+
+     Args:
+         chunk (ContentChunk): The content chunk containing metadata
+         parsed (dict): The dictionary of already parsed sections to update
+
+     Returns:
+         dict: The updated parsed dictionary with metadata values added
+
+     Notes:
+         - Keys 'document' and 'info' are explicitly excluded from metadata processing
+         - Only metadata keys that match section names in SourceFormatConfig will be processed
+         - If chunk.metadata is None or not iterable, the parsed dict is returned unchanged
+         - Metadata values are added directly to the parsed dictionary using their original keys
+     """
+     # Return unchanged parsed dict if metadata is None
+     if not hasattr(chunk, "metadata") or chunk.metadata is None:
+         return parsed
+
+     # Ensure metadata is a dictionary
+     metadata_dict = dict(chunk.metadata) if hasattr(chunk.metadata, "__iter__") else {}
+
+     # Define keys that should not be treated as metadata keys
+     excluded_keys = {"document", "info"}
+
+     # Get the keys from SourceFormatConfig.sections
+     valid_section_keys = set(sections.keys()) - excluded_keys
+
+     # Update parsed with valid metadata entries
+     for key, value in metadata_dict.items():
+         if key in valid_section_keys:
+             parsed[key] = value
+
+     return parsed
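To make the flow of `format_chunk` easier to follow end to end, the sketch below re-traces its steps (section extraction, page-range formatting, per-section `.format()`, then `string.Template` substitution) with plain stdlib code. The chunk text is invented and a plain string stands in for `ContentChunk`, whose required constructor fields are not shown in this diff.

```python
import re
from string import Template

sections = {
    "document": "<|document|>{}<|/document|>\n",
    "info": "<|info|>{}<|/info|>\n",
    "page": "<|page|>{}<|/page|>\n",
}
source_template = "<source${index}>${document}${page}${info}${text}</source${index}>"

text = "<|document|>Sample Doc.pdf<|/document|>\n<|info|>Important info<|/info|>\nMain content"

# Step 1: pull the document and info sections out of the text (as _parse_chunk does)
parsed = {}
for section in ("document", "info"):
    pattern = sections[section].replace("{}", "(.*?)").replace("|", r"\|")
    match = re.search(pattern, text, re.DOTALL)
    parsed[section] = match.group(1) if match else ""
    text = text.replace(match.group(0), "") if match else text
parsed["text"] = text.strip()

# Step 2: the page string _format_page_range would build for start_page=1, end_page=3
parsed["page"] = "1 - 3"

# Step 3: re-wrap each non-empty section in its template, then fill the source template
formatted = {
    name: (template.format(parsed[name]) if parsed.get(name) else "")
    for name, template in sections.items()
}
formatted["text"] = parsed["text"]
print(Template(source_template).substitute(index=1, **formatted))
# <source1><|document|>Sample Doc.pdf<|/document|>
# <|page|>1 - 3<|/page|>
# <|info|>Important info<|/info|>
# Main content</source1>
```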