kolzchut-ragbot 1.6.0__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kolzchut_ragbot/engine.py +19 -50
- kolzchut_ragbot/llm_client.py +1 -1
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/METADATA +1 -1
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/RECORD +6 -6
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/WHEEL +0 -0
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/top_level.txt +0 -0
kolzchut_ragbot/engine.py
CHANGED
@@ -170,20 +170,21 @@ class Engine:
 
         return fused_list
 
-    def search_documents(self, query: str, top_k: int):
+    def search_documents(self, query: str, top_k: int,retrieval_size: int, max_documents_from_same_page:int):
         """
         Searches for documents based on the query and returns the top_k results.
 
         Args:
             query (str): The query string.
             top_k (int): The number of top documents to return.
-
+            retrieval_size (int, optional): The number of documents to fetch from each model.
+            max_documents_from_same_page (int, optional): The maximum number of documents (paragraphs acutually) to return from the same page.
         Returns:
             list: A list of top k documents.
         """
         query_embeddings = {f"{semantic_model}": self.models[semantic_model].encode(query) for semantic_model in
                             definitions.models.keys()}
-        all_docs_by_model = self.elastic_model.search(query_embeddings)
+        all_docs_by_model = self.elastic_model.search(embedded_search=query_embeddings, size=retrieval_size)
         all_docs = []
         ids_for_fusion = []
         all_docs_and_scores = {}
@@ -202,20 +203,20 @@ class Engine:
         print(f"\nFusing {len(ids_for_fusion)} results\n")
         fused_ids = self.reciprocal_rank_fusion(ids_for_fusion, k=top_k)
         top_k_documents = []
-
+        count_per_id = {}
 
-        for fused_id in fused_ids:
+        for fused_id in fused_ids[:top_k]:
             for doc in all_docs:
-                if doc["_source"]["page_id"] == fused_id
+                if doc["_source"]["page_id"] == fused_id:
+                    count = count_per_id.get(fused_id, 0)
+                    if count >= max_documents_from_same_page:
+                        break;
                     top_k_documents.append(doc["_source"])
-
-                    break
-            if len(top_titles) >= top_k:
-                break
+                    count_per_id[fused_id] = count + 1
 
         return top_k_documents, all_docs_and_scores
 
-    def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False):
+    def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False, retrieval_size: int = 50, max_documents_from_same_page:int=3):
         """
         Answers a query using the top_k documents and the specified model.
 
@@ -225,23 +226,27 @@ class Engine:
             model: The model to use for answering the query.
             additional_document (dict, optional): An additional document to include in the search. Default is None.
             send_complete_pages_to_llm (bool, optional): Whether to send complete pages to the
+            retrieval_size(int, optional): The number of documents to fetch from each model. Default is 50.
+            max_documents_from_same_page(int, optional): The maximum number of documents (paragraphs acutually) to return from the same page. Default is 3.
 
         Returns:
             tuple: A tuple containing the top k documents, the answer, and the stats.
         """
         before_retrieval = time.perf_counter()
-        top_k_documents, all_docs_and_scores = self.search_documents(query, top_k)
+        top_k_documents, all_docs_and_scores = self.search_documents(query=query, top_k=top_k, retrieval_size=retrieval_size,max_documents_from_same_page=max_documents_from_same_page)
 
         if send_complete_pages_to_llm:
             top_k_documents = [self.transform_document_to_full_page(doc) for doc in top_k_documents]
 
+        top_k_documents_and_additional_document = [*top_k_documents, additional_document]
+
         if additional_document:
-
+            top_k_documents_and_additional_document.append(additional_document)
 
         retrieval_time = round(time.perf_counter() - before_retrieval, 4)
         print(f"retrieval time: {retrieval_time}")
 
-        gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query,
+        gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query, top_k_documents_and_additional_document)
         stats = {
             "retrieval_time": retrieval_time,
             "gpt_model": model,
@@ -286,42 +291,6 @@ class Engine:
         full_document = unite_docs_to_single_instance(parts_of_documents)
         return full_document
 
-    def transform_document_to_full_page(self, document: dict) -> dict:
-        """
-        Adds the full page content to the document by retrieving it from Elasticsearch.
-
-        Args:
-            document (dict): The document to which the full page content will be added.
-
-        Returns:
-            dict: The updated document with the full page content added.
-        """
-        if not document.get("page_id"):
-            return document
-        full_document = self.get_full_document_by_page_id(document["page_id"])
-        if full_document and full_document.get("content"):
-            document["content"] = full_document["content"]
-        return document
-
-    def get_full_document_by_page_id(self, page_id: int) -> dict | None:
-        """
-        Retrieves a unified document instance for a given page_id by searching all indices in Elasticsearch.
-
-        Args:
-            page_id (int): The page ID to search for.
-
-        Returns:
-            dict | None: A single dict representing the united document (with metadata and concatenated content),
-            or None if no documents are found.
-        """
-        es_client = self.elastic_model.es_client
-        indices = es_client.indices.get_alias(index="*").keys()
-        parts_of_documents = find_page_id_in_all_indices(page_id=page_id, es_client=es_client, indices=indices)
-        if not parts_of_documents:
-            return None
-        full_document = unite_docs_to_single_instance(parts_of_documents)
-        return full_document
-
 
 engine = None
 
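For context, the behavioral core of this engine.py change is the new selection loop in search_documents: retrieval_size now controls how many candidates are fetched from Elasticsearch per model, and max_documents_from_same_page caps how many paragraphs from a single page can reach the LLM. Below is a minimal, self-contained sketch of that capping pattern; the function name and the sample documents are illustrative and not taken from the package.

# Illustrative sketch of the per-page cap applied after reciprocal rank fusion.
# `fused_ids` stands in for the ranking returned by reciprocal_rank_fusion and
# `all_docs` for the raw Elasticsearch hits; both are made-up sample data.
def select_top_documents(fused_ids, all_docs, top_k, max_documents_from_same_page):
    top_documents = []
    count_per_id = {}
    for fused_id in fused_ids[:top_k]:
        for doc in all_docs:
            if doc["_source"]["page_id"] == fused_id:
                count = count_per_id.get(fused_id, 0)
                if count >= max_documents_from_same_page:
                    break  # enough paragraphs from this page already
                top_documents.append(doc["_source"])
                count_per_id[fused_id] = count + 1
    return top_documents

if __name__ == "__main__":
    sample_docs = [{"_source": {"page_id": 1, "content": f"paragraph {i}"}} for i in range(5)]
    sample_docs.append({"_source": {"page_id": 2, "content": "paragraph from another page"}})
    # With a cap of 3, page 1 contributes only its first three paragraphs.
    print(select_top_documents([1, 2], sample_docs, top_k=2, max_documents_from_same_page=3))
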
kolzchut_ragbot/llm_client.py
CHANGED
@@ -7,5 +7,5 @@ class LLMClient(ABC):
     def __init__(self):
         self.field_for_answer = definitions.field_for_llm
     @abstractmethod
-    def answer(self, _question, _top_k_docs) -> tuple[str, float, int]:
+    def answer(self, _question, _top_k_docs) -> tuple[str, float, int, dict]:
         raise NotImplementedError
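The abstract answer method now declares a four-element return; the call site in engine.py unpacks it as gpt_answer, gpt_elapsed, tokens, request_params. Below is a self-contained sketch of a subclass conforming to the new signature; the class name, the locally fabricated answer, and the placeholder request parameters are invented for illustration, and the base class is restated only so the snippet runs on its own.

import time
from abc import ABC, abstractmethod

class LLMClient(ABC):
    # Minimal stand-in mirroring the abstract interface from kolzchut_ragbot/llm_client.py.
    @abstractmethod
    def answer(self, _question, _top_k_docs) -> tuple[str, float, int, dict]:
        raise NotImplementedError

class EchoLLMClient(LLMClient):
    # Illustrative implementation that fabricates an answer locally instead of calling an LLM.
    def answer(self, question, top_k_docs) -> tuple[str, float, int, dict]:
        start = time.perf_counter()
        answer_text = f"{len(top_k_docs)} documents retrieved for: {question}"
        elapsed = round(time.perf_counter() - start, 4)
        tokens = len(answer_text.split())
        request_params = {"temperature": 0.0}  # placeholder request parameters
        return answer_text, elapsed, tokens, request_params
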
{kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/RECORD
CHANGED
@@ -2,11 +2,11 @@ kolzchut_ragbot/Document.py,sha256=5OyBBTZyAJFM_1Pjs3SUC-_s5zEJ5U6wjhw12_FFkdE,3
 kolzchut_ragbot/IntegrateService.py,sha256=rcwUY2RkclCY3l8BGAmNbstdxhxwhLO9oA8BofqLyts,96
 kolzchut_ragbot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kolzchut_ragbot/config.py,sha256=uILFvgn9W92-NRaKXYtaoQXpn3KOWKK8SZYRsIAa5Yw,133
-kolzchut_ragbot/engine.py,sha256=
+kolzchut_ragbot/engine.py,sha256=_CMuQxJ1rmJt32kY0Wpp2acXPDDkP6A3tLSzWtSSaEQ,13238
 kolzchut_ragbot/get_full_documents_utilities.py,sha256=YWljmGWM6h1ghLDCAUnDdhmn-0k6R_t7b1g7wSojzvg,1882
-kolzchut_ragbot/llm_client.py,sha256=
+kolzchut_ragbot/llm_client.py,sha256=JdDeOn2THpkOM2Mwe2DucTaYXul1fL2agIisBuHFtc8,347
 kolzchut_ragbot/model.py,sha256=HCi3r4YztPknnbgTOA7I-GVaqxn8CzrTeLFkEg-7fg0,6320
-kolzchut_ragbot-1.
-kolzchut_ragbot-1.
-kolzchut_ragbot-1.
-kolzchut_ragbot-1.
+kolzchut_ragbot-1.7.1.dist-info/METADATA,sha256=oKOVoVVM3_JzZWdcR5hBKgslY25TfLEIaNupCVQxTaM,1999
+kolzchut_ragbot-1.7.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+kolzchut_ragbot-1.7.1.dist-info/top_level.txt,sha256=NTZoY4GGw3v_7jm0MgcdHw8simoZ78PsR7Meqmkgd_Q,16
+kolzchut_ragbot-1.7.1.dist-info/RECORD,,
{kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/WHEEL
File without changes
{kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/top_level.txt
File without changes