admin-api-lib 3.2.0 (admin_api_lib-3.2.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- admin_api_lib/__init__.py +0 -0
- admin_api_lib/api_endpoints/document_deleter.py +24 -0
- admin_api_lib/api_endpoints/document_reference_retriever.py +25 -0
- admin_api_lib/api_endpoints/documents_status_retriever.py +20 -0
- admin_api_lib/api_endpoints/file_uploader.py +31 -0
- admin_api_lib/api_endpoints/source_uploader.py +40 -0
- admin_api_lib/api_endpoints/uploader_base.py +30 -0
- admin_api_lib/apis/__init__.py +0 -0
- admin_api_lib/apis/admin_api.py +197 -0
- admin_api_lib/apis/admin_api_base.py +120 -0
- admin_api_lib/chunker/__init__.py +0 -0
- admin_api_lib/chunker/chunker.py +25 -0
- admin_api_lib/dependency_container.py +236 -0
- admin_api_lib/extractor_api_client/__init__.py +0 -0
- admin_api_lib/extractor_api_client/openapi_client/__init__.py +38 -0
- admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +4 -0
- admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +516 -0
- admin_api_lib/extractor_api_client/openapi_client/api_client.py +695 -0
- admin_api_lib/extractor_api_client/openapi_client/api_response.py +20 -0
- admin_api_lib/extractor_api_client/openapi_client/configuration.py +460 -0
- admin_api_lib/extractor_api_client/openapi_client/exceptions.py +197 -0
- admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +21 -0
- admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +34 -0
- admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +103 -0
- admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +82 -0
- admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +104 -0
- admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +92 -0
- admin_api_lib/extractor_api_client/openapi_client/rest.py +209 -0
- admin_api_lib/extractor_api_client/openapi_client/test/__init__.py +0 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +35 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py +59 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +56 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +39 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +62 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +54 -0
- admin_api_lib/file_services/file_service.py +77 -0
- admin_api_lib/impl/__init__.py +0 -0
- admin_api_lib/impl/admin_api.py +167 -0
- admin_api_lib/impl/api_endpoints/default_document_deleter.py +84 -0
- admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py +72 -0
- admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py +41 -0
- admin_api_lib/impl/api_endpoints/default_file_uploader.py +234 -0
- admin_api_lib/impl/api_endpoints/default_source_uploader.py +202 -0
- admin_api_lib/impl/chunker/__init__.py +0 -0
- admin_api_lib/impl/chunker/chunker_type.py +11 -0
- admin_api_lib/impl/chunker/semantic_text_chunker.py +252 -0
- admin_api_lib/impl/chunker/text_chunker.py +33 -0
- admin_api_lib/impl/file_services/__init__.py +0 -0
- admin_api_lib/impl/file_services/s3_service.py +130 -0
- admin_api_lib/impl/information_enhancer/__init__.py +0 -0
- admin_api_lib/impl/information_enhancer/general_enhancer.py +52 -0
- admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +62 -0
- admin_api_lib/impl/information_enhancer/summary_enhancer.py +74 -0
- admin_api_lib/impl/key_db/__init__.py +0 -0
- admin_api_lib/impl/key_db/file_status_key_value_store.py +111 -0
- admin_api_lib/impl/mapper/informationpiece2document.py +108 -0
- admin_api_lib/impl/settings/__init__.py +0 -0
- admin_api_lib/impl/settings/chunker_class_type_settings.py +18 -0
- admin_api_lib/impl/settings/chunker_settings.py +29 -0
- admin_api_lib/impl/settings/document_extractor_settings.py +21 -0
- admin_api_lib/impl/settings/key_value_settings.py +26 -0
- admin_api_lib/impl/settings/rag_api_settings.py +21 -0
- admin_api_lib/impl/settings/s3_settings.py +31 -0
- admin_api_lib/impl/settings/source_uploader_settings.py +23 -0
- admin_api_lib/impl/settings/summarizer_settings.py +86 -0
- admin_api_lib/impl/summarizer/__init__.py +0 -0
- admin_api_lib/impl/summarizer/langchain_summarizer.py +117 -0
- admin_api_lib/information_enhancer/__init__.py +0 -0
- admin_api_lib/information_enhancer/information_enhancer.py +34 -0
- admin_api_lib/main.py +54 -0
- admin_api_lib/models/__init__.py +0 -0
- admin_api_lib/models/document_status.py +86 -0
- admin_api_lib/models/extra_models.py +9 -0
- admin_api_lib/models/http_validation_error.py +105 -0
- admin_api_lib/models/key_value_pair.py +85 -0
- admin_api_lib/models/status.py +44 -0
- admin_api_lib/models/validation_error.py +104 -0
- admin_api_lib/models/validation_error_loc_inner.py +114 -0
- admin_api_lib/prompt_templates/__init__.py +0 -0
- admin_api_lib/prompt_templates/summarize_prompt.py +14 -0
- admin_api_lib/rag_backend_client/__init__.py +0 -0
- admin_api_lib/rag_backend_client/openapi_client/__init__.py +60 -0
- admin_api_lib/rag_backend_client/openapi_client/api/__init__.py +4 -0
- admin_api_lib/rag_backend_client/openapi_client/api/rag_api.py +968 -0
- admin_api_lib/rag_backend_client/openapi_client/api_client.py +698 -0
- admin_api_lib/rag_backend_client/openapi_client/api_response.py +22 -0
- admin_api_lib/rag_backend_client/openapi_client/configuration.py +460 -0
- admin_api_lib/rag_backend_client/openapi_client/exceptions.py +197 -0
- admin_api_lib/rag_backend_client/openapi_client/models/__init__.py +41 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_history.py +99 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_history_message.py +83 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_request.py +93 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_response.py +103 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_role.py +35 -0
- admin_api_lib/rag_backend_client/openapi_client/models/content_type.py +37 -0
- admin_api_lib/rag_backend_client/openapi_client/models/delete_request.py +99 -0
- admin_api_lib/rag_backend_client/openapi_client/models/information_piece.py +110 -0
- admin_api_lib/rag_backend_client/openapi_client/models/key_value_pair.py +83 -0
- admin_api_lib/rag_backend_client/openapi_client/rest.py +209 -0
- admin_api_lib/summarizer/__init__.py +0 -0
- admin_api_lib/summarizer/summarizer.py +33 -0
- admin_api_lib/utils/__init__.py +0 -0
- admin_api_lib/utils/utils.py +32 -0
- admin_api_lib-3.2.0.dist-info/METADATA +24 -0
- admin_api_lib-3.2.0.dist-info/RECORD +106 -0
- admin_api_lib-3.2.0.dist-info/WHEEL +4 -0
admin_api_lib/impl/chunker/semantic_text_chunker.py

@@ -0,0 +1,252 @@

```python
"""Semantic text chunker backed by LangChain's semantic splitter.

Adds optional max/min chunk size enforcement via RecursiveCharacterTextSplitter
when both values are provided and ``max_chunk_size > min_chunk_size``.
"""

from __future__ import annotations

import logging
import re
from collections.abc import Iterable
from inspect import signature
from typing import Any, Type

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_experimental.text_splitter import SemanticChunker as LangchainSemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
from nltk.tokenize import PunktSentenceTokenizer

from admin_api_lib.chunker.chunker import Chunker

logger = logging.getLogger(__name__)


class SemanticTextChunker(Chunker):
    """Semantic text chunker backed by LangChain's semantic splitter with optional max/min chunk size enforcement."""

    def __init__(
        self,
        embeddings: Embeddings,
        *,
        breakpoint_threshold_type: str | None = None,
        breakpoint_threshold_amount: float | None = None,
        buffer_size: int | None = None,
        min_chunk_size: int | None = None,
        max_chunk_size: int | None = None,
        overlap: int | None = None,
        chunker_cls: Type[LangchainSemanticChunker] = LangchainSemanticChunker,
        recursive_text_splitter: RecursiveCharacterTextSplitter | None = None,
    ) -> None:
        """Initialise the semantic chunker.

        Parameters
        ----------
        embeddings : Embeddings
            The embeddings backend that powers semantic similarity detection.
        breakpoint_threshold_type : str | None, optional
            Strategy used to derive semantic breakpoints.
        breakpoint_threshold_amount : float | None, optional
            Threshold to apply for the selected breakpoint strategy.
        buffer_size : int | None, optional
            Number of neighbouring sentences to include for context.
        min_chunk_size : int | None, optional
            Minimum chunk size enforced by the chunker.
        max_chunk_size : int | None, optional
            Maximum chunk size enforced by the chunker.
        overlap : int | None, optional
            Number of overlapping characters between chunks.
        chunker_cls : type[LangchainSemanticChunker], optional
            Concrete semantic chunker implementation to instantiate. Defaults to
            :class:`langchain_experimental.text_splitter.SemanticChunker`.
        recursive_text_splitter : RecursiveCharacterTextSplitter | None, optional
            Optional pre-configured recursive text splitter to use for max/min chunk size enforcement.
        """
        self._min_chunk_size = min_chunk_size
        self._max_chunk_size = max_chunk_size
        self._overlap = overlap

        init_params = _supported_parameters(chunker_cls)
        candidate_kwargs: dict[str, Any] = {
            "breakpoint_threshold_type": breakpoint_threshold_type,
            "breakpoint_threshold_amount": breakpoint_threshold_amount,
            "buffer_size": buffer_size,
            "min_chunk_size": min_chunk_size,
        }
        filtered_kwargs = {
            key: value for key, value in candidate_kwargs.items() if value is not None and key in init_params
        }

        self._semantic_chunker = chunker_cls(
            embeddings=embeddings,
            **filtered_kwargs,
        )

        # Configure a recursive splitter for max/min enforcement when requested.
        # If none provided, instantiate a sensible default using max_chunk_size and overlap.
        self._recursive_splitter: RecursiveCharacterTextSplitter | None = None
        if self._min_chunk_size and self._max_chunk_size and self._max_chunk_size > self._min_chunk_size:
            if recursive_text_splitter is not None:
                self._recursive_splitter = recursive_text_splitter
            else:
                self._recursive_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=int(self._max_chunk_size),
                    chunk_overlap=int(self._overlap or 0),
                )

    def chunk(self, documents: Iterable[Document]) -> list[Document]:
        """Split documents into chunks.

        The documents are first processed by the semantic chunker; any oversized
        chunks are then re-split using the recursive text splitter when
        ``min_chunk_size``/``max_chunk_size`` enforcement is configured.

        Parameters
        ----------
        documents : Iterable[Document]
            Documents to be chunked.

        Returns
        -------
        list[Document]
            Chunked documents.
        """
        documents_list = list(documents)
        if not documents_list:
            return []

        sem_chunks = self._semantic_chunker.split_documents(documents_list)

        # If no max/min enforcement requested, return directly
        if not self._recursive_splitter:
            return sem_chunks

        # Enforce max size by re-splitting only oversized chunks, then ensure minimum size
        final_chunks: list[Document] = []
        for chunk in sem_chunks:
            text = chunk.page_content or ""
            if len(text) <= self._max_chunk_size:  # type: ignore[arg-type]
                final_chunks.append(chunk)
                continue

            # Split this oversized chunk using the recursive splitter, preserving metadata
            sub_chunks = self._recursive_splitter.split_documents([chunk])  # type: ignore[union-attr]

            # Ensure minimum size by balancing the last small chunk with its predecessor
            balanced = self._rebalance_min_size(sub_chunks)
            final_chunks.extend(balanced)

        return final_chunks

    def _rebalance_min_size(self, chunks: list[Document]) -> list[Document]:
        """Rebalance chunks so that the trailing chunk meets ``min_chunk_size`` when possible.

        Strategy
        --------
        - If the last chunk is smaller than ``min_chunk_size`` and there is a previous chunk,
          combine both and re-split into one or two chunks such that each is within
          [min_chunk_size, max_chunk_size]. This guarantees no tiny tail when feasible.
        - Otherwise, return the chunks unchanged.
        """
        if not chunks or len(chunks) == 1:
            return chunks

        last = chunks[-1]
        prev = chunks[-2]

        overlap = int(self._overlap or 0)
        prev_text = prev.page_content
        last_text = last.page_content
        tail = last_text[overlap:] if overlap > 0 else last_text
        combined_text = prev_text + "\n" + tail
        combined_len = len(combined_text)

        # Case 1: Combined fits entirely under max -> single merged chunk
        if combined_len <= self._max_chunk_size:
            merged = Document(page_content=combined_text, metadata=prev.metadata)
            return chunks[:-2] + [merged]

        # Case 2: Split combined into two parts within [min, max] using sentence boundaries if possible
        # Compute candidate breakpoints at sentence ends
        boundaries = self._sentence_boundaries(combined_text)
        # Ideal target for the first part: stay within [min,max] and leave >= min for the tail
        target_first = combined_len - self._min_chunk_size
        target_first = max(self._min_chunk_size, min(target_first, self._max_chunk_size))

        # Filter boundaries that satisfy constraints for both parts
        valid = self._filter_boundaries(boundaries, combined_len)

        cut_at = None
        if valid:
            # choose boundary closest to target_first
            cut_at = min(valid, key=lambda i: abs(i - target_first))
        else:
            # As a conservative fallback, try any boundary <= max that leaves a non-empty tail
            candidates = [i for i in boundaries if i <= self._max_chunk_size and combined_len - i > 0]
            if candidates:
                cut_at = max(candidates)

        if cut_at is None:
            # Could not find a safe sentence boundary; keep original chunks
            return chunks

        first_text = combined_text[:cut_at]
        second_text = combined_text[cut_at:]
        first = Document(page_content=first_text, metadata=prev.metadata)
        second = Document(page_content=second_text, metadata=prev.metadata)
        return chunks[:-2] + [first, second]

    def _filter_boundaries(self, boundaries: list[int], combined_len: int) -> list[int]:
        """Filter boundaries to find valid split points."""
        valid = []
        for idx in boundaries:
            size1 = idx
            size2 = combined_len - idx
            if size1 < self._min_chunk_size or size1 > self._max_chunk_size:
                continue
            if size2 < self._min_chunk_size:  # leave at least min for the tail
                continue
            valid.append(idx)
        return valid

    def _sentence_boundaries(self, text: str) -> list[int]:
        """Return indices in ``text`` that are good sentence breakpoints.

        Tries NLTK's sentence tokenizer if available, otherwise uses a regex-based
        heuristic to detect sentence ends. Indices are character offsets suitable
        for slicing ``text[:idx]`` and ``text[idx:]``.
        """
        try:
            tokenizer = PunktSentenceTokenizer()
            spans = list(tokenizer.span_tokenize(text))
            return [end for (_, end) in spans]
        except Exception:
            logger.info("NLTK Punkt tokenizer unavailable, falling back to regex-based sentence boundary detection.")
        # Regex heuristic: sentence end punctuation followed by whitespace/newline
        boundaries: list[int] = []
        for m in re.finditer(r"(?<=[\.!?])[\"'”)]*\s+", text):
            boundaries.append(m.end())
        # Ensure we don't return 0 or len(text) as boundaries
        return [i for i in boundaries if 0 < i < len(text)]


def _supported_parameters(chunker_cls: type) -> set[str]:
    """Return constructor parameters supported by ``chunker_cls``.

    Parameters
    ----------
    chunker_cls : type
        Semantic chunker class whose constructor signature should be inspected.

    Returns
    -------
    set[str]
        Set of keyword-parameter names accepted by the constructor.
    """
    try:
        params = signature(chunker_cls.__init__).parameters
    except (TypeError, ValueError):  # pragma: no cover - defensive, should not occur
        return set()
    return {name for name in params if name != "self"}
```
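For orientation, a minimal usage sketch of the chunker above. The embeddings backend (OpenAIEmbeddings) and all parameter values are illustrative assumptions, not package defaults; any `langchain_core.embeddings.Embeddings` implementation works:

```python
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings  # assumption: any Embeddings backend works

from admin_api_lib.impl.chunker.semantic_text_chunker import SemanticTextChunker

chunker = SemanticTextChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",  # one of SemanticChunker's breakpoint strategies
    min_chunk_size=200,   # together with max_chunk_size, enables the re-split/rebalance path
    max_chunk_size=1000,
    overlap=50,
)

docs = [Document(page_content="A long article ...", metadata={"source": "article.pdf"})]
for piece in chunker.chunk(docs):
    print(len(piece.page_content), piece.metadata)
```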
admin_api_lib/impl/chunker/text_chunker.py

@@ -0,0 +1,33 @@

```python
"""Module containing the TextChunker class."""

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from admin_api_lib.chunker.chunker import Chunker


class TextChunker(Chunker):
    """A class that chunks text documents into smaller chunks."""

    def __init__(self, splitter: RecursiveCharacterTextSplitter):
        # NOTE: `CharacterTextSplitter` does not take chunk_size into consideration.
        # See: https://github.com/langchain-ai/langchain/issues/10410#issuecomment-1712595675
        # For that reason, we use the recursive splitter.
        self._splitter = splitter

    def chunk(self, documents: list[Document]) -> list[Document]:
        """
        Chunk the given documents into smaller chunks.

        Parameters
        ----------
        documents : list[Document]
            The documents to be chunked.

        Returns
        -------
        list[Document]
            The list of chunked documents.
        """
        return self._splitter.split_documents(documents)
```
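A short sketch of how TextChunker is wired; the splitter parameters are illustrative (in the service they would come from the chunker settings via the dependency container):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from admin_api_lib.impl.chunker.text_chunker import TextChunker

# Illustrative values; TextChunker simply delegates to the configured splitter.
chunker = TextChunker(RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50))
chunks = chunker.chunk([Document(page_content="Some long text " * 100)])
```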
admin_api_lib/impl/file_services/__init__.py: File without changes
admin_api_lib/impl/file_services/s3_service.py

@@ -0,0 +1,130 @@

````python
"""Class to handle I/O with S3 storage."""

import logging
from pathlib import Path
from typing import BinaryIO

import boto3

from admin_api_lib.file_services.file_service import FileService
from admin_api_lib.impl.settings.s3_settings import S3Settings

logger = logging.getLogger(__name__)


class S3Service(FileService):
    """Class to handle I/O with S3 storage."""

    def __init__(self, s3_settings: S3Settings):
        """Initialize the S3 client from the given settings.

        Parameters
        ----------
        s3_settings : S3Settings
            Settings for the S3 connection. Must contain at least the endpoint,
            access_key_id, secret_access_key and bucket.
        """
        self._s3_settings = s3_settings
        self._s3_client = boto3.client(
            "s3",
            endpoint_url=s3_settings.endpoint,
            aws_access_key_id=s3_settings.access_key_id,
            aws_secret_access_key=s3_settings.secret_access_key,
            aws_session_token=None,
            config=boto3.session.Config(signature_version="s3v4"),
            verify=False,
        )

    def download_folder(self, source: str, target: Path) -> None:
        """Download the remote folder "source" to the local "target" directory.

        Parameters
        ----------
        source : str
            Path to the remote folder.
        target : Path
            Download destination path.
        """
        target.mkdir(parents=True, exist_ok=True)

        search_response = self._s3_client.list_objects_v2(
            Bucket=self._s3_settings.bucket,
            Prefix=source,
        )
        for found_content in search_response.get("Contents", []):
            file_source = found_content["Key"]
            target_path = target / file_source[len(source) :]
            target_path.parent.mkdir(parents=True, exist_ok=True)
            with open(target_path, "wb") as local_file:
                self.download_file(file_source, local_file)

    def download_file(self, source: str, target_file: BinaryIO) -> None:
        """Read a single remote file "source" into the local "target_file" file-like object.

        Example usage
        =============
        ```
        s3_settings: S3Settings = get_s3_settings()
        s3_service = S3Service(s3_settings)

        with tempfile.SpooledTemporaryFile() as temp_file:
            s3_service.download_file("remote_file", temp_file)
            # do stuff with temp_file
        ```

        Parameters
        ----------
        source : str
            Path to the remote file.
        target_file : BinaryIO
            File-like object to save the data to.
        """
        self._s3_client.download_fileobj(self._s3_settings.bucket, source, target_file)

    def upload_file(self, file_path: str, file_name: str) -> None:
        """
        Upload a local file to the S3 bucket.

        Parameters
        ----------
        file_path : str
            The path to the local file to upload.
        file_name : str
            The target key in the S3 bucket where the file will be stored.
        """
        self._s3_client.upload_file(
            Filename=file_path,
            Bucket=self._s3_settings.bucket,
            Key=file_name,
        )

    def get_all_sorted_file_names(self) -> list[str]:
        """Retrieve all file names stored in the S3 bucket.

        Returns
        -------
        list[str]
            A list of file names stored in the S3 bucket.
        """
        file_names = []

        resp = self._s3_client.list_objects_v2(Bucket=self._s3_settings.bucket)
        if resp.get("Contents"):
            for obj in resp["Contents"]:
                file_names.append(obj["Key"])
        return file_names

    def delete_file(self, file_name: str) -> None:
        """Delete a file from the S3 bucket.

        Parameters
        ----------
        file_name : str
            The name of the file to be deleted from the S3 bucket.
        """
        try:
            file_name = f"/{file_name}" if not file_name.startswith("/") else file_name
            self._s3_client.delete_object(Bucket=self._s3_settings.bucket, Key=file_name)
            logger.info("File %s successfully deleted.", file_name)
        except Exception:
            logger.exception("Error deleting file %s", file_name)
            raise
````
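A hypothetical wiring sketch for S3Service. Constructing S3Settings with explicit kwargs, and the endpoint/credentials shown, are assumptions for illustration; in deployment the settings would typically be populated from the environment:

```python
import tempfile

from admin_api_lib.impl.file_services.s3_service import S3Service
from admin_api_lib.impl.settings.s3_settings import S3Settings

# Placeholder endpoint/credentials, e.g. a local MinIO instance (assumption).
settings = S3Settings(
    endpoint="http://localhost:9000",
    access_key_id="admin",
    secret_access_key="password",
    bucket="documents",
)
service = S3Service(settings)

service.upload_file("./report.pdf", "report.pdf")
print(service.get_all_sorted_file_names())

with tempfile.TemporaryFile() as buffer:
    service.download_file("report.pdf", buffer)
    buffer.seek(0)
    data = buffer.read()

# Note: delete_file prepends "/" to keys that lack one (see the implementation above).
service.delete_file("report.pdf")
```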
admin_api_lib/impl/information_enhancer/__init__.py: File without changes
admin_api_lib/impl/information_enhancer/general_enhancer.py

@@ -0,0 +1,52 @@

```python
"""Module containing the GeneralEnhancer class."""

from asyncio import gather
from typing import Optional

from langchain_core.runnables import RunnableConfig, ensure_config

from admin_api_lib.information_enhancer.information_enhancer import (
    InformationEnhancer,
    RetrieverInput,
    RetrieverOutput,
)


class GeneralEnhancer(InformationEnhancer):
    """The GeneralEnhancer aggregates multiple InformationEnhancer instances.

    InformationEnhancers are applied asynchronously to the input information.
    """

    def __init__(self, enhancers: list[InformationEnhancer]):
        """Initialize the GeneralEnhancer with a list of InformationEnhancer instances.

        Parameters
        ----------
        enhancers : list of InformationEnhancer
            A list of InformationEnhancer instances to be used by the GeneralEnhancer.
        """
        super().__init__()
        self._enhancers = enhancers

    async def ainvoke(self, information: RetrieverInput, config: Optional[RunnableConfig] = None) -> RetrieverOutput:
        """Asynchronously invoke each information enhancer with the given input and configuration.

        Parameters
        ----------
        information : RetrieverInput
            The input information to be processed by the general information enhancer.
        config : Optional[RunnableConfig], optional
            The configuration settings for the general information enhancer, by default None.

        Returns
        -------
        RetrieverOutput
            The output after processing the input information.
        """
        config = ensure_config(config)
        summarize_tasks = [enhancer.ainvoke(information, config) for enhancer in self._enhancers]
        summary_results = await gather(*summarize_tasks)
        for summaries in summary_results:
            information += summaries
        return information
```
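To illustrate the fan-out contract, a minimal sketch with stub enhancers. StubEnhancer is hypothetical; it assumes that, as with the enhancers in this package, subclasses only need to override `ainvoke`:

```python
import asyncio

from langchain_core.documents import Document

from admin_api_lib.impl.information_enhancer.general_enhancer import GeneralEnhancer
from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer


class StubEnhancer(InformationEnhancer):
    """Hypothetical enhancer that returns one tagged document per input (illustration only)."""

    def __init__(self, tag: str):
        super().__init__()
        self._tag = tag

    async def ainvoke(self, information, config=None):
        # Produce derived documents; GeneralEnhancer appends them to the originals.
        return [Document(page_content=f"{self._tag}: {doc.page_content}") for doc in information]


docs = [Document(page_content="hello"), Document(page_content="world")]
enhanced = asyncio.run(GeneralEnhancer([StubEnhancer("summary"), StubEnhancer("keywords")]).ainvoke(docs))
# `enhanced` holds the two originals plus the four stub documents gathered from both enhancers.
```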
admin_api_lib/impl/information_enhancer/page_summary_enhancer.py

@@ -0,0 +1,62 @@

```python
"""Module for enhancing the summary of pages by grouping information by page and summarizing each page."""

from asyncio import gather
from hashlib import sha256
from typing import Optional

from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig
from tqdm import tqdm

from admin_api_lib.impl.information_enhancer.summary_enhancer import SummaryEnhancer
from rag_core_lib.impl.data_types.content_type import ContentType


class PageSummaryEnhancer(SummaryEnhancer):
    """
    Enhances the summary of pages by grouping information by page and summarizing each page.

    Attributes
    ----------
    BASE64_IMAGE_KEY : str
        Key used to identify base64 encoded images in metadata.
    DEFAULT_PAGE_NR : int
        Default page number used when no page metadata is available.
    """

    BASE64_IMAGE_KEY = "base64_image"
    DEFAULT_PAGE_NR = 1

    async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document:
        full_page_content = " ".join([piece.page_content for piece in page_pieces])
        summary = await self._summarizer.ainvoke(full_page_content, config)
        meta = {key: value for key, value in page_pieces[0].metadata.items() if key != self.BASE64_IMAGE_KEY}
        meta["id"] = sha256(str.encode(full_page_content)).hexdigest()
        meta["related"] = meta["related"] + [piece.metadata["id"] for piece in page_pieces]
        meta["related"] = list(set(meta["related"]))
        meta["type"] = ContentType.SUMMARY.value

        return Document(metadata=meta, page_content=summary)

    async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]:
        distinct_pages = []
        for info in information:
            if info.metadata.get("page", self.DEFAULT_PAGE_NR) not in distinct_pages:
                distinct_pages.append(info.metadata.get("page", self.DEFAULT_PAGE_NR))

        grouped = []
        for page in distinct_pages:
            group = []
            for compare_info in information:
                if compare_info.metadata.get("page", self.DEFAULT_PAGE_NR) == page:
                    group.append(compare_info)
            if (
                self._chunker_settings
                and len(" ".join([item.page_content for item in group])) < self._chunker_settings.max_size
            ):
                continue
            grouped.append(group)

        summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)]

        return await gather(*summary_tasks)
```
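For orientation, the per-piece metadata shape this enhancer relies on (key names taken from the code above; the values are made up):

```python
from langchain_core.documents import Document

# "page" drives the grouping, "id"/"related" feed the provenance links of the
# generated summary, and "type" on the summary is set to ContentType.SUMMARY.
piece = Document(
    page_content="First paragraph of page 3.",
    metadata={"page": 3, "id": "piece-3-0", "related": [], "type": "text"},
)
```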
admin_api_lib/impl/information_enhancer/summary_enhancer.py

@@ -0,0 +1,74 @@

```python
"""Module with SummaryEnhancer class that enhances information by generating summaries using a provided Summarizer."""

from abc import abstractmethod
from typing import Optional

from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig, ensure_config

from admin_api_lib.impl.settings.chunker_settings import ChunkerSettings
from admin_api_lib.information_enhancer.information_enhancer import (
    InformationEnhancer,
    RetrieverInput,
    RetrieverOutput,
)
from admin_api_lib.summarizer.summarizer import Summarizer
from rag_core_lib.impl.data_types.content_type import ContentType


class SummaryEnhancer(InformationEnhancer):
    """The SummaryEnhancer enhances information by generating summaries using a provided Summarizer instance.

    Attributes
    ----------
    INFORMATION_METADATA_TYPE : str
        A constant string representing the type of information metadata.
    """

    INFORMATION_METADATA_TYPE = "type"

    def __init__(self, summarizer: Summarizer, chunker_settings: Optional[ChunkerSettings] = None):
        """
        Initialize the SummaryEnhancer with a given Summarizer instance.

        Parameters
        ----------
        summarizer : Summarizer
            An instance of the Summarizer class used to generate summaries.
        chunker_settings : Optional[ChunkerSettings], optional
            Chunker settings used by subclasses to skip groups below the maximum chunk size, by default None.
        """
        super().__init__()
        self._summarizer = summarizer
        self._chunker_settings = chunker_settings

    @staticmethod
    def _is_relevant(information: Document) -> bool:
        match information.metadata.get(SummaryEnhancer.INFORMATION_METADATA_TYPE, ContentType.SUMMARY):  # noqa:R503
            case ContentType.SUMMARY | ContentType.IMAGE:
                return False
            case _:
                return True

    async def ainvoke(self, information: RetrieverInput, config: Optional[RunnableConfig] = None) -> RetrieverOutput:
        """
        Asynchronously invokes the summary enhancer on the provided information.

        Parameters
        ----------
        information : RetrieverInput
            The input information to be processed and summarized.
        config : Optional[RunnableConfig], optional
            Configuration for the runnable, by default None.

        Returns
        -------
        RetrieverOutput
            The summarized output of the provided information.
        """
        config = ensure_config(config)
        pieces_to_summarize = [info for info in information if self._is_relevant(info)]
        return await self._acreate_summary(pieces_to_summarize, config)

    @abstractmethod
    async def _acreate_summary(
        self, information: list[Document], config: Optional[RunnableConfig]
    ) -> list[Document]: ...
```
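A hypothetical subclass sketch showing the contract: concrete enhancers implement only `_acreate_summary`, while the relevance filtering (skipping SUMMARY and IMAGE pieces) is inherited from `ainvoke`. The class and its single-summary behaviour are assumptions for illustration, mirroring the pattern of PageSummaryEnhancer above:

```python
from typing import Optional

from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig

from admin_api_lib.impl.information_enhancer.summary_enhancer import SummaryEnhancer
from rag_core_lib.impl.data_types.content_type import ContentType


class WholeDocumentSummaryEnhancer(SummaryEnhancer):
    """Hypothetical subclass that emits a single summary over all relevant pieces."""

    async def _acreate_summary(
        self, information: list[Document], config: Optional[RunnableConfig]
    ) -> list[Document]:
        if not information:
            return []
        # Concatenate everything and delegate to the injected Summarizer, as
        # PageSummaryEnhancer does per page.
        full_text = " ".join(piece.page_content for piece in information)
        summary = await self._summarizer.ainvoke(full_text, config)
        return [Document(page_content=summary, metadata={"type": ContentType.SUMMARY.value})]
```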
admin_api_lib/impl/key_db/__init__.py: File without changes