unique_toolkit 0.8.4__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/app/schemas.py +3 -0
- unique_toolkit/language_model/schemas.py +18 -0
- unique_toolkit/reference_manager/reference_manager.py +72 -0
- unique_toolkit/tools/agent_chunks_handler.py +62 -0
- unique_toolkit/tools/config.py +108 -0
- unique_toolkit/tools/{tool_factory.py → factory.py} +15 -5
- unique_toolkit/tools/schemas.py +138 -0
- unique_toolkit/tools/test/test_tool_progress_reporter.py +204 -0
- unique_toolkit/tools/tool.py +168 -0
- unique_toolkit/tools/tool_manager.py +242 -0
- unique_toolkit/tools/tool_progress_reporter.py +4 -11
- unique_toolkit/tools/utils/execution/execution.py +282 -0
- unique_toolkit/tools/utils/source_handling/schema.py +22 -0
- unique_toolkit/tools/utils/source_handling/source_formatting.py +207 -0
- unique_toolkit/tools/utils/source_handling/tests/test_source_formatting.py +215 -0
- {unique_toolkit-0.8.4.dist-info → unique_toolkit-0.8.5.dist-info}/METADATA +4 -1
- {unique_toolkit-0.8.4.dist-info → unique_toolkit-0.8.5.dist-info}/RECORD +19 -10
- unique_toolkit/tools/tool_definitions.py +0 -145
- unique_toolkit/tools/tool_definitionsV2.py +0 -137
- {unique_toolkit-0.8.4.dist-info → unique_toolkit-0.8.5.dist-info}/LICENSE +0 -0
- {unique_toolkit-0.8.4.dist-info → unique_toolkit-0.8.5.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
from typing import (
|
|
4
|
+
Awaitable,
|
|
5
|
+
Callable,
|
|
6
|
+
Generic,
|
|
7
|
+
Iterable,
|
|
8
|
+
ParamSpec,
|
|
9
|
+
Type,
|
|
10
|
+
TypeVar,
|
|
11
|
+
cast,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Function types
|
|
15
|
+
P = ParamSpec("P")
|
|
16
|
+
R = TypeVar("R")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Result(Generic[R]):
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
success: bool,
|
|
26
|
+
result: R | None = None,
|
|
27
|
+
exception: Exception | None = None,
|
|
28
|
+
) -> None:
|
|
29
|
+
self._success = success
|
|
30
|
+
self._result = result
|
|
31
|
+
self._exception = exception
|
|
32
|
+
|
|
33
|
+
@property
|
|
34
|
+
def exception(self) -> Exception | None:
|
|
35
|
+
return self._exception
|
|
36
|
+
|
|
37
|
+
@property
|
|
38
|
+
def success(self) -> bool:
|
|
39
|
+
return self._success
|
|
40
|
+
|
|
41
|
+
def unpack(self, default: R | None = None) -> R:
|
|
42
|
+
return cast(R, self._result) if self.success else cast(R, default)
|
|
43
|
+
|
|
44
|
+
def __str__(self) -> str:
|
|
45
|
+
return (
|
|
46
|
+
f"Success: {str(self._result)}"
|
|
47
|
+
if self.success
|
|
48
|
+
else f"Failure: {str(self._exception)}"
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class SafeTaskExecutor:
    """
    Execute function calls "safely": exceptions are caught and logged,
    and the function result is returned as a `Result` object.

    Several parameters are available to customize the behavior of the executor:
    - `exceptions`: a list of exceptions that should be caught and logged
    - `ignored_exceptions`: a list of exceptions that should be passed through
    - `log_exceptions`: whether to log exceptions
    - `log_exc_info`: whether to log exception info
    - `logger`: a logger to use for logging (falls back to the module logger)

    Usage:
    ```python
    executor = SafeTaskExecutor(
        exceptions=(ValueError,),
        ignored_exceptions=(KeyError,),
    )

    executor.execute(failing_function, "test")

    executor.execute_async(async_failing_function, "test")
    ```
    """

    def __init__(
        self,
        exceptions: Iterable[Type[Exception]] = (Exception,),
        ignored_exceptions: Iterable[Type[Exception]] = (),
        log_exceptions: bool = True,
        log_exc_info: bool = True,
        logger: logging.Logger | None = None,
    ) -> None:
        # `except` clauses and `isinstance` require tuples, not arbitrary iterables.
        self._exceptions = tuple(exceptions)
        self._ignored_exceptions = tuple(ignored_exceptions)
        self._log_exceptions = log_exceptions
        self._log_exc_info = log_exc_info
        self._logger = logger

    def execute(
        self, f: Callable[P, R], *args: P.args, **kwargs: P.kwargs
    ) -> Result[R]:
        """Call `f(*args, **kwargs)` and wrap the outcome in a `Result`."""
        try:
            return Result(True, f(*args, **kwargs))
        except self._exceptions as e:
            # Ignored exceptions propagate to the caller unchanged.
            if isinstance(e, self._ignored_exceptions):
                raise e
            if self._log_exceptions:
                # BUGFIX: honor the logger passed to the constructor; it was
                # previously stored in `self._logger` but never used.
                (self._logger or logger).error(
                    f"Error in {f.__name__}: {e}", exc_info=self._log_exc_info
                )
            return Result(False, exception=e)

    async def execute_async(
        self, f: Callable[P, Awaitable[R]], *args: P.args, **kwargs: P.kwargs
    ) -> Result[R]:
        """Await `f(*args, **kwargs)` and wrap the outcome in a `Result`."""
        try:
            return Result(True, await f(*args, **kwargs))
        except self._exceptions as e:
            # Ignored exceptions propagate to the caller unchanged.
            if isinstance(e, self._ignored_exceptions):
                raise e
            if self._log_exceptions:
                # BUGFIX: honor the logger passed to the constructor (see execute).
                (self._logger or logger).error(
                    f"Error in {f.__name__}: {e}", exc_info=self._log_exc_info
                )
            return Result(False, exception=e)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def safe_execute(f: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> Result[R]:
    """
    Run `f(*args, **kwargs)` with default failsafe behavior.

    Any exception raised by `f` is caught and logged, and the outcome is
    wrapped in a `Result` object instead of propagating.

    Example:
    ```python
    def failing_function(a: str) -> int:
        raise ValueError(a)

    result = safe_execute(failing_function, "test")
    print(result)             # Failure: test
    result.success            # False
    result.unpack()           # None
    result.unpack(default=1)  # 1
    result.exception          # ValueError('test')
    ```

    ```python
    def succeeding_function(a: str):
        return a

    result = safe_execute(succeeding_function, "test")
    print(result)             # Success: test
    result.success            # True
    result.unpack()           # 'test'
    result.exception          # None
    ```
    """
    executor = SafeTaskExecutor()
    return executor.execute(f, *args, **kwargs)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
async def safe_execute_async(
    f: Callable[P, Awaitable[R]], *args: P.args, **kwargs: P.kwargs
) -> Result[R]:
    """Async counterpart of `safe_execute`: awaits `f` and wraps the outcome in a `Result`."""
    executor = SafeTaskExecutor()
    return await executor.execute_async(f, *args, **kwargs)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# Type of the fallback value a @failsafe-decorated function returns on failure.
FailureReturnType = TypeVar("FailureReturnType")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def failsafe(
    failure_return_value: FailureReturnType,
    exceptions: Iterable[Type[Exception]] = (Exception,),
    ignored_exceptions: Iterable[Type[Exception]] = (),
    log_exceptions: bool = True,
    log_exc_info: bool = True,
    logger: logging.Logger | None = None,
) -> Callable[[Callable[P, R]], Callable[P, R | FailureReturnType]]:
    """
    Decorator giving sync functions failsafe behavior: exceptions are caught
    and logged, and `failure_return_value` is returned instead of raising.

    Parameters mirror `SafeTaskExecutor`, plus:
    - `failure_return_value`: value to return when an exception occurs

    Example:
    ```python
    @failsafe(
        failure_return_value="default",
        exceptions=(ValueError,),
        ignored_exceptions=(KeyError,),
    )
    def failing_function(a: str) -> str:
        raise ValueError(a)

    failing_function("test")  # returns "default" instead of raising
    ```
    """
    # The executor is immutable configuration, so a single instance can be
    # shared by every function this decorator is applied to.
    executor = SafeTaskExecutor(
        exceptions=exceptions,
        ignored_exceptions=ignored_exceptions,
        log_exceptions=log_exceptions,
        log_exc_info=log_exc_info,
        logger=logger,
    )

    def decorator(func: Callable[P, R]) -> Callable[P, R | FailureReturnType]:
        @functools.wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> R | FailureReturnType:
            outcome = executor.execute(func, *args, **kwargs)
            return outcome.unpack(default=cast(R, failure_return_value))

        return wrapper

    return decorator
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def failsafe_async(
    failure_return_value: FailureReturnType,
    exceptions: Iterable[Type[Exception]] = (Exception,),
    ignored_exceptions: Iterable[Type[Exception]] = (),
    log_exceptions: bool = True,
    log_exc_info: bool = True,
    logger: logging.Logger | None = None,
) -> Callable[
    [Callable[P, Awaitable[R]]], Callable[P, Awaitable[R | FailureReturnType]]
]:
    """
    Decorator giving async functions failsafe behavior: exceptions are caught
    and logged, and `failure_return_value` is returned instead of raising.

    Parameters mirror `SafeTaskExecutor`, plus:
    - `failure_return_value`: value to return when an exception occurs

    Example:
    ```python
    @failsafe_async(
        failure_return_value=[],
        exceptions=(ValueError,),
        ignored_exceptions=(KeyError,),
    )
    async def async_failing_function(a: str) -> list:
        raise ValueError(a)

    await async_failing_function("test")  # returns [] instead of raising
    ```
    """
    # One immutable executor serves every coroutine this decorator wraps.
    executor = SafeTaskExecutor(
        exceptions=exceptions,
        ignored_exceptions=ignored_exceptions,
        log_exceptions=log_exceptions,
        log_exc_info=log_exc_info,
        logger=logger,
    )

    def decorator(
        func: Callable[P, Awaitable[R]],
    ) -> Callable[P, Awaitable[R | FailureReturnType]]:
        @functools.wraps(func)
        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R | FailureReturnType:
            outcome = await executor.execute_async(func, *args, **kwargs)
            return outcome.unpack(default=cast(R, failure_return_value))

        return wrapper

    return decorator
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# default schema follows logic in node-ingestion-worker: https://github.com/Unique-AG/monorepo/blob/76b4923611199a80abf9304639b3aa0538ec41ed/node/apps/node-ingestion-worker/src/ingestors/lib/text-manipulations.ts#L181C17-L181C28
|
|
2
|
+
from pydantic import BaseModel
|
|
3
|
+
|
|
4
|
+
from unique_toolkit.tools.config import get_configuration_dict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Default wrapper for a single rendered source; `${index}`, `${document}`,
# `${info}` and `${text}` are `string.Template`-style placeholders.
SOURCE_TEMPLATE = "<source${index}>${document}${info}${text}</source${index}>"
# Per-section wrapper templates; the literal "{}" marks the content slot
# (it also becomes the capture group when converted to a regex pattern).
SECTIONS = {
    "document": "<|document|>{}<|/document|>\n",
    "info": "<|info|>{}<|/info|>\n",
}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SourceFormatConfig(BaseModel):
    """Configuration describing how a source chunk is rendered and parsed."""

    model_config = get_configuration_dict()

    # Overall output template (`string.Template`-style `${...}` placeholders).
    source_template: str = SOURCE_TEMPLATE
    # Section name -> section template, where "{}" marks the content slot.
    sections: dict[str, str] = SECTIONS

    @staticmethod
    def template_to_pattern(template: str) -> str:
        """Convert a section template into a regex that captures its content."""
        # Escape the literal pipes first, then turn the "{}" slot into a
        # non-greedy capture group; the two replacements never interact.
        escaped = template.replace("|", r"\|")
        return escaped.replace("{}", "(.*?)")
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from string import Template
|
|
3
|
+
|
|
4
|
+
from unique_toolkit.content.schemas import ContentChunk
|
|
5
|
+
from unique_toolkit.tools.utils.source_handling.schema import SourceFormatConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _format_page_range(chunk: ContentChunk) -> str:
    """Build a page indicator ("5" or "1 - 3") from the chunk's page metadata."""
    start, end = chunk.start_page, chunk.end_page
    # Both bounds must be present and positive; otherwise there is no page info.
    if not start or not end or start <= 0 or end <= 0:
        return ""
    if start == end:
        return str(start)
    return f"{start} - {end}"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_chunk(
    chunk: ContentChunk, section_templates: dict[str, str]
) -> dict[str, str]:
    """Split a chunk's text into its embedded sections plus the remaining body text."""
    remaining = chunk.text
    parsed: dict[str, str] = {}

    for name, template in section_templates.items():
        # Only "document" and "info" are embedded in the text itself; other
        # sections (e.g. "page") are derived from metadata and skipped here.
        if name not in ("document", "info"):
            continue
        pattern = SourceFormatConfig.template_to_pattern(template)
        match = re.search(pattern, remaining, re.DOTALL)
        if match:
            parsed[name] = match.group(1)
            # Drop the matched markup so it does not remain in the body text.
            remaining = remaining.replace(match.group(0), "")
        else:
            parsed[name] = ""

    parsed["text"] = remaining.strip()
    return parsed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def format_chunk(index: int, chunk: ContentChunk, config: SourceFormatConfig) -> str:
    """
    Render a content chunk as a numbered source string.

    Chunks typically embed a `document` section and an optional `info` section
    inside their text. This function extracts those sections, derives a `page`
    section from `start_page`/`end_page`, merges in metadata values whose keys
    match configured section names, wraps each non-empty section in its
    template, and substitutes everything into `config.source_template`.

    Args:
        index (int): Source index substituted for `${index}` in the template.
        chunk (ContentChunk): Provides `text`, optional `start_page`/`end_page`,
            and optional `metadata` key-value pairs.
        config (SourceFormatConfig): `source_template` for the overall output
            plus per-section templates.

    Returns:
        str: The chunk rendered through `source_template`.

    Notes:
        - The page section is a single number when `start_page` equals
          `end_page`, otherwise a range like "1 - 3"; missing or zero page
          numbers produce an empty section.
        - Metadata keys must match the keys of `config.sections` (e.g. a
          section keyed "date" reads `chunk.metadata["date"]`), not the tag
          text inside the section template.
        - Sections with no content render as "" rather than empty tags.
    """
    sections = config.sections
    source_template = config.source_template

    # Pull embedded sections out of the text, then enrich with the derived
    # page range and any matching metadata values.
    parsed = _parse_chunk(chunk, sections)
    parsed["page"] = _format_page_range(chunk)
    parsed = _process_metadata(chunk, parsed, sections)

    # Wrap every non-empty section in its template; empty sections vanish.
    formatted_sections = {
        name: template.format(parsed[name]) if parsed.get(name) else ""
        for name, template in sections.items()
    }

    # The body text is passed through as-is.
    formatted_sections["text"] = parsed["text"]

    return Template(source_template).substitute(index=index, **formatted_sections)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _process_metadata(
    chunk: ContentChunk, parsed: dict[str, str], sections: dict[str, str]
) -> dict[str, str]:
    """
    Process metadata from chunk and update the parsed dictionary.

    This function extracts metadata from a ContentChunk object and updates the
    parsed dictionary with values whose keys match section names defined in
    `sections`.

    Args:
        chunk (ContentChunk): The content chunk containing metadata
        parsed (dict): The dictionary of already parsed sections to update
        sections (dict): Mapping of section names to their format templates

    Returns:
        dict: The updated parsed dictionary with metadata values added

    Notes:
        - Keys 'document' and 'info' are explicitly excluded from metadata
          processing (they are parsed out of the chunk text instead)
        - Only metadata keys that match section names in `sections` will be processed
        - If chunk.metadata is None or not iterable, the parsed dict is returned unchanged
        - Metadata values are added directly to the parsed dictionary using their original keys
    """
    # Return unchanged parsed dict if metadata is None
    if not hasattr(chunk, "metadata") or chunk.metadata is None:
        return parsed

    # Ensure metadata is a dictionary
    metadata_dict = dict(chunk.metadata) if hasattr(chunk.metadata, "__iter__") else {}

    # Define keys that should not be treated as metadata keys
    excluded_keys = {"document", "info"}

    # Get the remaining (metadata-backed) keys from the configured sections
    valid_section_keys = set(sections.keys()) - excluded_keys

    # Update parsed with valid metadata entries
    for key, value in metadata_dict.items():
        if key in valid_section_keys:
            parsed[key] = value

    return parsed
|