admin-api-lib 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. admin_api_lib/__init__.py +0 -0
  2. admin_api_lib/api_endpoints/document_deleter.py +24 -0
  3. admin_api_lib/api_endpoints/document_reference_retriever.py +25 -0
  4. admin_api_lib/api_endpoints/documents_status_retriever.py +20 -0
  5. admin_api_lib/api_endpoints/file_uploader.py +31 -0
  6. admin_api_lib/api_endpoints/source_uploader.py +40 -0
  7. admin_api_lib/api_endpoints/uploader_base.py +30 -0
  8. admin_api_lib/apis/__init__.py +0 -0
  9. admin_api_lib/apis/admin_api.py +197 -0
  10. admin_api_lib/apis/admin_api_base.py +120 -0
  11. admin_api_lib/chunker/__init__.py +0 -0
  12. admin_api_lib/chunker/chunker.py +25 -0
  13. admin_api_lib/dependency_container.py +236 -0
  14. admin_api_lib/extractor_api_client/__init__.py +0 -0
  15. admin_api_lib/extractor_api_client/openapi_client/__init__.py +38 -0
  16. admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +4 -0
  17. admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +516 -0
  18. admin_api_lib/extractor_api_client/openapi_client/api_client.py +695 -0
  19. admin_api_lib/extractor_api_client/openapi_client/api_response.py +20 -0
  20. admin_api_lib/extractor_api_client/openapi_client/configuration.py +460 -0
  21. admin_api_lib/extractor_api_client/openapi_client/exceptions.py +197 -0
  22. admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +21 -0
  23. admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +34 -0
  24. admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +103 -0
  25. admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +82 -0
  26. admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +104 -0
  27. admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +92 -0
  28. admin_api_lib/extractor_api_client/openapi_client/rest.py +209 -0
  29. admin_api_lib/extractor_api_client/openapi_client/test/__init__.py +0 -0
  30. admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +35 -0
  31. admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py +59 -0
  32. admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +56 -0
  33. admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +39 -0
  34. admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +62 -0
  35. admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +54 -0
  36. admin_api_lib/file_services/file_service.py +77 -0
  37. admin_api_lib/impl/__init__.py +0 -0
  38. admin_api_lib/impl/admin_api.py +167 -0
  39. admin_api_lib/impl/api_endpoints/default_document_deleter.py +84 -0
  40. admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py +72 -0
  41. admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py +41 -0
  42. admin_api_lib/impl/api_endpoints/default_file_uploader.py +234 -0
  43. admin_api_lib/impl/api_endpoints/default_source_uploader.py +202 -0
  44. admin_api_lib/impl/chunker/__init__.py +0 -0
  45. admin_api_lib/impl/chunker/chunker_type.py +11 -0
  46. admin_api_lib/impl/chunker/semantic_text_chunker.py +252 -0
  47. admin_api_lib/impl/chunker/text_chunker.py +33 -0
  48. admin_api_lib/impl/file_services/__init__.py +0 -0
  49. admin_api_lib/impl/file_services/s3_service.py +130 -0
  50. admin_api_lib/impl/information_enhancer/__init__.py +0 -0
  51. admin_api_lib/impl/information_enhancer/general_enhancer.py +52 -0
  52. admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +62 -0
  53. admin_api_lib/impl/information_enhancer/summary_enhancer.py +74 -0
  54. admin_api_lib/impl/key_db/__init__.py +0 -0
  55. admin_api_lib/impl/key_db/file_status_key_value_store.py +111 -0
  56. admin_api_lib/impl/mapper/informationpiece2document.py +108 -0
  57. admin_api_lib/impl/settings/__init__.py +0 -0
  58. admin_api_lib/impl/settings/chunker_class_type_settings.py +18 -0
  59. admin_api_lib/impl/settings/chunker_settings.py +29 -0
  60. admin_api_lib/impl/settings/document_extractor_settings.py +21 -0
  61. admin_api_lib/impl/settings/key_value_settings.py +26 -0
  62. admin_api_lib/impl/settings/rag_api_settings.py +21 -0
  63. admin_api_lib/impl/settings/s3_settings.py +31 -0
  64. admin_api_lib/impl/settings/source_uploader_settings.py +23 -0
  65. admin_api_lib/impl/settings/summarizer_settings.py +86 -0
  66. admin_api_lib/impl/summarizer/__init__.py +0 -0
  67. admin_api_lib/impl/summarizer/langchain_summarizer.py +117 -0
  68. admin_api_lib/information_enhancer/__init__.py +0 -0
  69. admin_api_lib/information_enhancer/information_enhancer.py +34 -0
  70. admin_api_lib/main.py +54 -0
  71. admin_api_lib/models/__init__.py +0 -0
  72. admin_api_lib/models/document_status.py +86 -0
  73. admin_api_lib/models/extra_models.py +9 -0
  74. admin_api_lib/models/http_validation_error.py +105 -0
  75. admin_api_lib/models/key_value_pair.py +85 -0
  76. admin_api_lib/models/status.py +44 -0
  77. admin_api_lib/models/validation_error.py +104 -0
  78. admin_api_lib/models/validation_error_loc_inner.py +114 -0
  79. admin_api_lib/prompt_templates/__init__.py +0 -0
  80. admin_api_lib/prompt_templates/summarize_prompt.py +14 -0
  81. admin_api_lib/rag_backend_client/__init__.py +0 -0
  82. admin_api_lib/rag_backend_client/openapi_client/__init__.py +60 -0
  83. admin_api_lib/rag_backend_client/openapi_client/api/__init__.py +4 -0
  84. admin_api_lib/rag_backend_client/openapi_client/api/rag_api.py +968 -0
  85. admin_api_lib/rag_backend_client/openapi_client/api_client.py +698 -0
  86. admin_api_lib/rag_backend_client/openapi_client/api_response.py +22 -0
  87. admin_api_lib/rag_backend_client/openapi_client/configuration.py +460 -0
  88. admin_api_lib/rag_backend_client/openapi_client/exceptions.py +197 -0
  89. admin_api_lib/rag_backend_client/openapi_client/models/__init__.py +41 -0
  90. admin_api_lib/rag_backend_client/openapi_client/models/chat_history.py +99 -0
  91. admin_api_lib/rag_backend_client/openapi_client/models/chat_history_message.py +83 -0
  92. admin_api_lib/rag_backend_client/openapi_client/models/chat_request.py +93 -0
  93. admin_api_lib/rag_backend_client/openapi_client/models/chat_response.py +103 -0
  94. admin_api_lib/rag_backend_client/openapi_client/models/chat_role.py +35 -0
  95. admin_api_lib/rag_backend_client/openapi_client/models/content_type.py +37 -0
  96. admin_api_lib/rag_backend_client/openapi_client/models/delete_request.py +99 -0
  97. admin_api_lib/rag_backend_client/openapi_client/models/information_piece.py +110 -0
  98. admin_api_lib/rag_backend_client/openapi_client/models/key_value_pair.py +83 -0
  99. admin_api_lib/rag_backend_client/openapi_client/rest.py +209 -0
  100. admin_api_lib/summarizer/__init__.py +0 -0
  101. admin_api_lib/summarizer/summarizer.py +33 -0
  102. admin_api_lib/utils/__init__.py +0 -0
  103. admin_api_lib/utils/utils.py +32 -0
  104. admin_api_lib-3.2.0.dist-info/METADATA +24 -0
  105. admin_api_lib-3.2.0.dist-info/RECORD +106 -0
  106. admin_api_lib-3.2.0.dist-info/WHEEL +4 -0
admin_api_lib/impl/chunker/semantic_text_chunker.py
@@ -0,0 +1,252 @@
+"""Semantic text chunker backed by LangChain's semantic splitter.
+
+Adds optional max/min chunk size enforcement via RecursiveCharacterTextSplitter
+when both values are provided and ``max_chunk_size > min_chunk_size``.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+import re
+from inspect import signature
+from typing import Any, Type
+import logging
+
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_experimental.text_splitter import SemanticChunker as LangchainSemanticChunker
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from nltk.tokenize import PunktSentenceTokenizer
+
+from admin_api_lib.chunker.chunker import Chunker
+
+
+logger = logging.getLogger(__name__)
+
+
+class SemanticTextChunker(Chunker):
+    """Semantic text chunker backed by LangChain's semantic splitter with optional max/min chunk size enforcement."""
+
+    def __init__(
+        self,
+        embeddings: Embeddings,
+        *,
+        breakpoint_threshold_type: str | None = None,
+        breakpoint_threshold_amount: float | None = None,
+        buffer_size: int | None = None,
+        min_chunk_size: int | None = None,
+        max_chunk_size: int | None = None,
+        overlap: int | None = None,
+        chunker_cls: Type[LangchainSemanticChunker] = LangchainSemanticChunker,
+        recursive_text_splitter: RecursiveCharacterTextSplitter | None = None,
+    ) -> None:
+        """Initialise the semantic chunker.
+
+        Parameters
+        ----------
+        embeddings : Embeddings
+            The embeddings backend that powers semantic similarity detection.
+        breakpoint_threshold_type : str | None, optional
+            Strategy used to derive semantic breakpoints.
+        breakpoint_threshold_amount : float | None, optional
+            Threshold to apply for the selected breakpoint strategy.
+        buffer_size : int | None, optional
+            Number of neighbouring sentences to include for context.
+        min_chunk_size : int | None, optional
+            Minimum chunk size enforced by the chunker.
+        max_chunk_size : int | None, optional
+            Maximum chunk size enforced by the chunker.
+        overlap : int | None, optional
+            Number of overlapping characters between chunks.
+        chunker_cls : type[LangchainSemanticChunker], optional
+            Concrete semantic chunker implementation to instantiate. Defaults to
+            :class:`langchain_experimental.text_splitter.SemanticChunker`.
+        recursive_text_splitter : RecursiveCharacterTextSplitter | None, optional
+            Optional pre-configured recursive text splitter to use for max/min chunk size enforcement.
+        """
+        self._min_chunk_size = min_chunk_size
+        self._max_chunk_size = max_chunk_size
+        self._overlap = overlap
+
+        init_params = _supported_parameters(chunker_cls)
+        candidate_kwargs: dict[str, Any] = {
+            "breakpoint_threshold_type": breakpoint_threshold_type,
+            "breakpoint_threshold_amount": breakpoint_threshold_amount,
+            "buffer_size": buffer_size,
+            "min_chunk_size": min_chunk_size,
+        }
+        filtered_kwargs = {
+            key: value for key, value in candidate_kwargs.items() if value is not None and key in init_params
+        }
+
+        self._semantic_chunker = chunker_cls(
+            embeddings=embeddings,
+            **filtered_kwargs,
+        )
+
+        # Configure a recursive splitter for max/min enforcement when requested.
+        # If none provided, instantiate a sensible default using max_chunk_size and overlap.
+        self._recursive_splitter: RecursiveCharacterTextSplitter | None = None
+        if self._min_chunk_size and self._max_chunk_size and self._max_chunk_size > self._min_chunk_size:
+            if recursive_text_splitter is not None:
+                self._recursive_splitter = recursive_text_splitter
+            else:
+                self._recursive_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=int(self._max_chunk_size),
+                    chunk_overlap=int(self._overlap or 0),
+                )
+
+    def chunk(self, documents: Iterable[Document]) -> list[Document]:
+        """Split documents into chunks.
+
+        The documents are first processed by the semantic chunker,
+        and then any oversized chunks are re-split using the recursive text splitter,
+        if `self._recursive_splitter` and `self._min_chunk_size`/`self._max_chunk_size` are configured.
+
+        Parameters
+        ----------
+        documents : Iterable[Document]
+            Documents to be chunked.
+
+        Returns
+        -------
+        list[Document]
+            Chunked documents.
+        """
+        documents_list = list(documents)
+        if not documents_list:
+            return []
+
+        sem_chunks = self._semantic_chunker.split_documents(documents_list)
+
+        # If no max/min enforcement requested, return directly
+        if not self._recursive_splitter:
+            return sem_chunks
+
+        # Enforce max size by re-splitting only oversized chunks, then ensure minimum size
+        final_chunks: list[Document] = []
+        for chunk in sem_chunks:
+            text = chunk.page_content or ""
+            if len(text) <= self._max_chunk_size:  # type: ignore[arg-type]
+                final_chunks.append(chunk)
+                continue
+
+            # Split this oversized chunk using the recursive splitter, preserving metadata
+            sub_chunks = self._recursive_splitter.split_documents([chunk])  # type: ignore[union-attr]
+
+            # Ensure minimum size by balancing the last small chunk with its predecessor
+            balanced = self._rebalance_min_size(sub_chunks)
+            final_chunks.extend(balanced)
+
+        return final_chunks
+
+    def _rebalance_min_size(self, chunks: list[Document]) -> list[Document]:
+        """Rebalance chunks so that the trailing chunk meets ``min_chunk_size`` when possible.
+
+        Strategy
+        --------
+        - If the last chunk is smaller than ``min_chunk_size`` and there is a previous chunk,
+          combine both and re-split into one or two chunks such that each is within
+          [min_chunk_size, max_chunk_size]. This guarantees no tiny tail when feasible.
+        - Otherwise, return the chunks unchanged.
+        """
+        if not chunks or len(chunks) == 1:
+            return chunks
+
+        last = chunks[-1]
+        prev = chunks[-2]
+
+        overlap = int(self._overlap or 0)
+        prev_text = prev.page_content
+        last_text = last.page_content
+        tail = last_text[overlap:] if overlap > 0 else last_text
+        combined_text = prev_text + "\n" + tail
+        combined_len = len(combined_text)
+
+        # Case 1: Combined fits entirely under max -> single merged chunk
+        if combined_len <= self._max_chunk_size:
+            merged = Document(page_content=combined_text, metadata=prev.metadata)
+            return chunks[:-2] + [merged]
+
+        # Case 2: Split combined into two parts within [min, max] using sentence boundaries if possible
+        # Compute candidate breakpoints at sentence ends
+        boundaries = self._sentence_boundaries(combined_text)
+        # Ideal target for the first part: stay within [min,max] and leave >= min for the tail
+        target_first = combined_len - self._min_chunk_size
+        target_first = max(self._min_chunk_size, min(target_first, self._max_chunk_size))
+
+        # Filter boundaries that satisfy constraints for both parts
+        valid = self._filter_boundaries(boundaries, combined_len)
+
+        cut_at = None
+        if valid:
+            # choose boundary closest to target_first
+            cut_at = min(valid, key=lambda i: abs(i - target_first))
+        else:
+            # As a conservative fallback, try any boundary <= max that leaves a non-empty tail
+            candidates = [i for i in boundaries if i <= self._max_chunk_size and combined_len - i > 0]
+            if candidates:
+                cut_at = max(candidates)
+
+        if cut_at is None:
+            # Could not find a safe sentence boundary; keep original chunks
+            return chunks
+
+        first_text = combined_text[:cut_at]
+        second_text = combined_text[cut_at:]
+        first = Document(page_content=first_text, metadata=prev.metadata)
+        second = Document(page_content=second_text, metadata=prev.metadata)
+        return chunks[:-2] + [first, second]
+
+    def _filter_boundaries(self, boundaries: list[int], combined_len: int) -> list[int]:
+        """Filter boundaries to find valid split points."""
+        valid = []
+        for idx in boundaries:
+            size1 = idx
+            size2 = combined_len - idx
+            if size1 < self._min_chunk_size or size1 > self._max_chunk_size:
+                continue
+            if size2 < self._min_chunk_size:  # leave at least min for the tail
+                continue
+            valid.append(idx)
+        return valid
+
+    def _sentence_boundaries(self, text: str) -> list[int]:
+        """Return indices in ``text`` that are good sentence breakpoints.
+
+        Tries NLTK's sentence tokenizer if available, otherwise uses a regex-based
+        heuristic to detect sentence ends. Indices are character offsets suitable
+        for slicing ``text[:idx]`` and ``text[idx:]``.
+        """
+        try:
+            tokenizer = PunktSentenceTokenizer()
+            spans = list(tokenizer.span_tokenize(text))
+            return [end for (_, end) in spans]
+        except Exception:
+            logger.info("NLTK Punkt tokenizer unavailable, falling back to regex-based sentence boundary detection.")
+        # Regex heuristic: sentence end punctuation followed by whitespace/newline
+        boundaries: list[int] = []
+        for m in re.finditer(r"(?<=[\.!?])[\"'”)]*\s+", text):
+            boundaries.append(m.end())
+        # Ensure we don't return 0 or len(text) as boundaries
+        return [i for i in boundaries if 0 < i < len(text)]
+
+
+def _supported_parameters(chunker_cls: type) -> set[str]:
+    """Return constructor parameters supported by ``chunker_cls``.
+
+    Parameters
+    ----------
+    chunker_cls : type
+        Semantic chunker class whose constructor signature should be inspected.
+
+    Returns
+    -------
+    set[str]
+        Set of keyword-parameter names accepted by the constructor.
+    """
+    try:
+        params = signature(chunker_cls.__init__).parameters
+    except (TypeError, ValueError):  # pragma: no cover - defensive, should not occur
+        return set()
+    return {name for name in params if name != "self"}
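For orientation, a minimal usage sketch of the new chunker (not part of the wheel; the OpenAIEmbeddings import and the size values are illustrative assumptions, any Embeddings implementation can be substituted):

```
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings  # assumption: stand-in for any Embeddings backend

from admin_api_lib.impl.chunker.semantic_text_chunker import SemanticTextChunker

long_text = " ".join(f"Sentence number {i}." for i in range(400))

chunker = SemanticTextChunker(
    embeddings=OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",  # forwarded only if the semantic chunker class accepts it
    min_chunk_size=200,                      # together with max_chunk_size, enables the recursive re-split step
    max_chunk_size=1000,
    overlap=50,
)
chunks = chunker.chunk([Document(page_content=long_text, metadata={"source": "doc-1"})])
print(len(chunks), max(len(c.page_content) for c in chunks))  # chunks are kept close to the configured size bounds
```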
admin_api_lib/impl/chunker/text_chunker.py
@@ -0,0 +1,33 @@
+"""Module containing the TextChunker class."""
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+
+from admin_api_lib.chunker.chunker import Chunker
+
+
+class TextChunker(Chunker):
+    """A class that chunks text documents into smaller chunks."""
+
+    def __init__(self, splitter: RecursiveCharacterTextSplitter):
+        # NOTE: `CharacterTextSplitter` does not take chunk_size into consideration
+        # See: https://github.com/langchain-ai/langchain/issues/10410#issuecomment-1712595675
+        # for that reason, we use the recursive splitter
+        self._splitter = splitter
+
+    def chunk(self, documents: list[Document]) -> list[Document]:
+        """
+        Chunk the given documents into smaller chunks.
+
+        Parameters
+        ----------
+        documents : list[Document]
+            The documents to be chunked.
+
+        Returns
+        -------
+        list[Document]
+            The list of chunked documents.
+
+        """
+        return self._splitter.split_documents(documents)
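A short sketch of how TextChunker is meant to be wired with the recursive splitter (illustrative only; the chunk sizes are arbitrary):

```
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from admin_api_lib.impl.chunker.text_chunker import TextChunker

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunker = TextChunker(splitter)
docs = [Document(page_content="word " * 400, metadata={"document": "example"})]
print(len(chunker.chunk(docs)))  # several chunks of at most 500 characters each
```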
File without changes
admin_api_lib/impl/file_services/s3_service.py
@@ -0,0 +1,130 @@
+"""Class to handle I/O with S3 storage."""
+
+import logging
+from pathlib import Path
+from typing import BinaryIO
+
+import boto3
+
+from admin_api_lib.file_services.file_service import FileService
+from admin_api_lib.impl.settings.s3_settings import S3Settings
+
+logger = logging.getLogger(__name__)
+
+
+class S3Service(FileService):
+    """Class to handle I/O with S3 storage."""
+
+    def __init__(self, s3_settings: S3Settings):
+        """Initialize the S3 service with the given settings.
+
+        Parameters
+        ----------
+        s3_settings: S3Settings
+            Settings for the S3 storage. Must contain at least the endpoint, access_key_id, secret_access_key and bucket.
+        """
+        self._s3_settings = s3_settings
+        self._s3_client = boto3.client(
+            "s3",
+            endpoint_url=s3_settings.endpoint,
+            aws_access_key_id=s3_settings.access_key_id,
+            aws_secret_access_key=s3_settings.secret_access_key,
+            aws_session_token=None,
+            config=boto3.session.Config(signature_version="s3v4"),
+            verify=False,
+        )
+
+    def download_folder(self, source: str, target: Path) -> None:
+        """Download the remote folder on "source" to the local "target" directory.
+
+        Parameters
+        ----------
+        source: str
+            Path to the remote folder.
+        target: Path
+            Download destination path.
+        """
+        target.mkdir(parents=True, exist_ok=True)
+
+        search_response = self._s3_client.list_objects_v2(
+            Bucket=self._s3_settings.bucket,
+            Prefix=source,
+        )
+        for found_content in search_response.get("Contents", []):
+            file_source = found_content["Key"]
+            target_path = target / file_source[len(source) :]
+            target_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(target_path, "wb") as local_file:
+                self.download_file(file_source, local_file)
+
+    def download_file(self, source: str, target_file: BinaryIO) -> None:
+        """Read a single remote file "source" into the local "target_file" file-like object.
+
+        Example usage
+        =============
+        ```
+        s3_settings = S3Settings()
+        s3_service = S3Service(s3_settings)
+
+        with tempfile.SpooledTemporaryFile() as temp_file:
+            s3_service.download_file("remote_file", temp_file)
+            # do stuff with temp_file
+        ```
+
+        Parameters
+        ----------
+        source: str
+            Path to the remote file.
+        target_file: BinaryIO
+            File-like object to save the data to.
+        """
+        self._s3_client.download_fileobj(self._s3_settings.bucket, source, target_file)
+
+    def upload_file(self, file_path: str, file_name: str) -> None:
+        """
+        Upload a local file to the S3 bucket.
+
+        Parameters
+        ----------
+        file_path : str
+            The path to the local file to upload.
+        file_name : str
+            The key in the S3 bucket under which the file will be stored.
+        """
+        self._s3_client.upload_file(
+            Filename=file_path,
+            Bucket=self._s3_settings.bucket,
+            Key=file_name,
+        )
+
+    def get_all_sorted_file_names(self) -> list[str]:
+        """Retrieve all file names stored in the S3 bucket.
+
+        Returns
+        -------
+        list[str]
+            A list of file names stored in the S3 bucket.
+        """
+        file_names = []
+
+        resp = self._s3_client.list_objects_v2(Bucket=self._s3_settings.bucket)
+        if resp.get("Contents"):
+            for obj in resp["Contents"]:
+                file_names.append(obj["Key"])
+        return file_names
+
+    def delete_file(self, file_name: str) -> None:
+        """Delete a file from the S3 bucket.
+
+        Parameters
+        ----------
+        file_name : str
+            The name of the file to be deleted from the S3 bucket.
+        """
+        try:
+            file_name = f"/{file_name}" if not file_name.startswith("/") else file_name
+            self._s3_client.delete_object(Bucket=self._s3_settings.bucket, Key=file_name)
+            logger.info("File %s successfully deleted.", file_name)
+        except Exception:
+            logger.exception("Error deleting file %s", file_name)
+            raise
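A minimal wiring sketch for the storage backend (endpoint, credentials and bucket are placeholders; it assumes the S3Settings model accepts these fields as keyword arguments, e.g. instead of reading them from environment variables):

```
import tempfile

from admin_api_lib.impl.file_services.s3_service import S3Service
from admin_api_lib.impl.settings.s3_settings import S3Settings

settings = S3Settings(
    endpoint="http://localhost:9000",  # placeholder MinIO-style endpoint
    access_key_id="access-key",        # placeholder credentials
    secret_access_key="secret-key",
    bucket="documents",
)
s3_service = S3Service(settings)

s3_service.upload_file("./report.pdf", "uploads/report.pdf")
print(s3_service.get_all_sorted_file_names())

with tempfile.TemporaryFile() as buffer:  # any binary file-like object works
    s3_service.download_file("uploads/report.pdf", buffer)
```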
File without changes
admin_api_lib/impl/information_enhancer/general_enhancer.py
@@ -0,0 +1,52 @@
+"""Module containing the GeneralEnhancer class."""
+
+from asyncio import gather
+from typing import Optional
+
+from langchain_core.runnables import RunnableConfig, ensure_config
+
+from admin_api_lib.information_enhancer.information_enhancer import (
+    InformationEnhancer,
+    RetrieverInput,
+    RetrieverOutput,
+)
+
+
+class GeneralEnhancer(InformationEnhancer):
+    """The GeneralEnhancer aggregates multiple InformationEnhancer instances.
+
+    InformationEnhancers are applied asynchronously to the input information.
+    """
+
+    def __init__(self, enhancers: list[InformationEnhancer]):
+        """Initialize the GeneralEnhancer with a list of InformationEnhancer instances.
+
+        Parameters
+        ----------
+        enhancers : list of InformationEnhancer
+            A list of InformationEnhancer instances to be used by the GeneralEnhancer.
+        """
+        super().__init__()
+        self._enhancers = enhancers
+
+    async def ainvoke(self, information: RetrieverInput, config: Optional[RunnableConfig] = None) -> RetrieverOutput:
+        """Asynchronously invokes each information enhancer with the given input and configuration.
+
+        Parameters
+        ----------
+        information : RetrieverInput
+            The input information to be processed by the general information enhancer.
+        config : Optional[RunnableConfig], optional
+            The configuration settings for the general information enhancer, by default None.
+
+        Returns
+        -------
+        RetrieverOutput
+            The output after processing the input information.
+        """
+        config = ensure_config(config)
+        summarize_tasks = [enhancer.ainvoke(information, config) for enhancer in self._enhancers]
+        summary_results = await gather(*summarize_tasks)
+        for summaries in summary_results:
+            information += summaries
+        return information
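The aggregation contract, sketched with a toy enhancer (illustration only; like GeneralEnhancer itself it overrides just ainvoke, and document lists are assumed for RetrieverInput/RetrieverOutput, which the list concatenation above suggests):

```
import asyncio
from typing import Optional

from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig

from admin_api_lib.impl.information_enhancer.general_enhancer import GeneralEnhancer
from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer


class KeywordEnhancer(InformationEnhancer):
    """Toy enhancer: emits one keyword document per input document."""

    async def ainvoke(self, information, config: Optional[RunnableConfig] = None):
        return [Document(page_content=f"keywords for: {doc.page_content[:20]}") for doc in information]


enhancer = GeneralEnhancer([KeywordEnhancer()])
docs = [Document(page_content="The quick brown fox jumps over the lazy dog.")]
enhanced = asyncio.run(enhancer.ainvoke(docs))
print(len(enhanced))  # the original document plus the generated keyword document
```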
admin_api_lib/impl/information_enhancer/page_summary_enhancer.py
@@ -0,0 +1,62 @@
+"""Module for enhancing the summary of pages by grouping information by page and summarizing each page."""
+
+from asyncio import gather
+from hashlib import sha256
+from typing import Optional
+
+from langchain_core.documents import Document
+from langchain_core.runnables import RunnableConfig
+from tqdm import tqdm
+
+from admin_api_lib.impl.information_enhancer.summary_enhancer import SummaryEnhancer
+from rag_core_lib.impl.data_types.content_type import ContentType
+
+
+class PageSummaryEnhancer(SummaryEnhancer):
+    """
+    Enhances the summary of pages by grouping information by page and summarizing each page.
+
+    Attributes
+    ----------
+    BASE64_IMAGE_KEY : str
+        Key used to identify base64 encoded images in metadata.
+    DEFAULT_PAGE_NR : int
+        Default page number used when no page metadata is available.
+    """
+
+    BASE64_IMAGE_KEY = "base64_image"
+    DEFAULT_PAGE_NR = 1
+
+    async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document:
+        full_page_content = " ".join([piece.page_content for piece in page_pieces])
+        summary = await self._summarizer.ainvoke(full_page_content, config)
+        meta = {key: value for key, value in page_pieces[0].metadata.items() if key != self.BASE64_IMAGE_KEY}
+        meta["id"] = sha256(str.encode(full_page_content)).hexdigest()
+        meta["related"] = meta["related"] + [piece.metadata["id"] for piece in page_pieces]
+        meta["related"] = list(set(meta["related"]))
+        meta["type"] = ContentType.SUMMARY.value
+
+        return Document(metadata=meta, page_content=summary)
+
+    async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]:
+        distinct_pages = []
+        for info in information:
+            if info.metadata.get("page", self.DEFAULT_PAGE_NR) not in distinct_pages:
+                distinct_pages.append(info.metadata.get("page", self.DEFAULT_PAGE_NR))
+
+        grouped = []
+        for page in distinct_pages:
+            group = []
+            for compare_info in information:
+                if compare_info.metadata.get("page", self.DEFAULT_PAGE_NR) == page:
+                    group.append(compare_info)
+            if (
+                self._chunker_settings
+                and len(" ".join([item.page_content for item in group])) < self._chunker_settings.max_size
+            ):
+                continue
+            grouped.append(group)
+
+        summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)]
+
+        return await gather(*summary_tasks)
admin_api_lib/impl/information_enhancer/summary_enhancer.py
@@ -0,0 +1,74 @@
+"""Module with SummaryEnhancer class that enhances information by generating summaries using a provided Summarizer."""
+
+from abc import abstractmethod
+from typing import Optional
+
+from admin_api_lib.impl.settings.chunker_settings import ChunkerSettings
+from langchain_core.documents import Document
+from langchain_core.runnables import RunnableConfig, ensure_config
+
+from admin_api_lib.information_enhancer.information_enhancer import (
+    InformationEnhancer,
+    RetrieverInput,
+    RetrieverOutput,
+)
+from admin_api_lib.summarizer.summarizer import Summarizer
+from rag_core_lib.impl.data_types.content_type import ContentType
+
+
+class SummaryEnhancer(InformationEnhancer):
+    """The SummaryEnhancer enhances information by generating summaries using a provided Summarizer instance.
+
+    Attributes
+    ----------
+    INFORMATION_METADATA_TYPE : str
+        A constant string representing the type of information metadata.
+    """
+
+    INFORMATION_METADATA_TYPE = "type"
+
+    def __init__(self, summarizer: Summarizer, chunker_settings: ChunkerSettings = None):
+        """
+        Initialize the SummaryEnhancer with a given Summarizer instance.
+
+        Parameters
+        ----------
+        summarizer : Summarizer
+            An instance of the Summarizer class used to generate summaries.
+        """
+        super().__init__()
+        self._summarizer = summarizer
+        self._chunker_settings = chunker_settings
+
+    @staticmethod
+    def _is_relevant(information: Document) -> bool:
+        match information.metadata.get(SummaryEnhancer.INFORMATION_METADATA_TYPE, ContentType.SUMMARY):  # noqa:R503
+            case ContentType.SUMMARY | ContentType.IMAGE:
+                return False
+            case _:
+                return True
+
+    async def ainvoke(self, information: RetrieverInput, config: Optional[RunnableConfig] = None) -> RetrieverOutput:
+        """
+        Asynchronously invokes the summary enhancer on the provided information.
+
+        Parameters
+        ----------
+        information : RetrieverInput
+            The input information to be processed and summarized.
+        config : Optional[RunnableConfig], optional
+            Configuration for the runnable, by default None.
+
+        Returns
+        -------
+        RetrieverOutput
+            The summarized output of the provided information.
+        """
+        config = ensure_config(config)
+        pieces_to_summarize = [info for info in information if self._is_relevant(info)]
+        return await self._acreate_summary(pieces_to_summarize, config)
+
+    @abstractmethod
+    async def _acreate_summary(
+        self, information: list[Document], config: Optional[RunnableConfig]
+    ) -> list[Document]: ...
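How a concrete subclass plugs into this base class, sketched with stand-in summary logic (the real pipeline wires PageSummaryEnhancer with the LangChain-based summarizer from impl/summarizer/langchain_summarizer.py; the toy subclass below and passing None for the unused summarizer are illustration-only assumptions):

```
import asyncio
from typing import Optional

from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig

from admin_api_lib.impl.information_enhancer.summary_enhancer import SummaryEnhancer


class FirstSentenceSummaryEnhancer(SummaryEnhancer):
    """Toy subclass: 'summarizes' each relevant document by keeping its first sentence."""

    async def _acreate_summary(
        self, information: list[Document], config: Optional[RunnableConfig]
    ) -> list[Document]:
        return [
            Document(page_content=doc.page_content.split(".")[0] + ".", metadata=dict(doc.metadata))
            for doc in information
        ]


enhancer = FirstSentenceSummaryEnhancer(summarizer=None)  # the toy subclass never calls the summarizer
docs = [Document(page_content="First sentence. Second sentence.", metadata={"type": "text"})]
print(asyncio.run(enhancer.ainvoke(docs))[0].page_content)  # "First sentence."
```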
File without changes