kolzchut-ragbot 1.6.0__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kolzchut_ragbot/engine.py +19 -50
- kolzchut_ragbot/llm_client.py +1 -1
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/METADATA +1 -1
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/RECORD +6 -6
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/WHEEL +0 -0
- {kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/top_level.txt +0 -0
kolzchut_ragbot/engine.py
CHANGED
@@ -170,20 +170,21 @@ class Engine:
 
         return fused_list
 
-    def search_documents(self, query: str, top_k: int):
+    def search_documents(self, query: str, top_k: int,retrieval_size: int, max_documents_from_same_page:int):
         """
         Searches for documents based on the query and returns the top_k results.
 
         Args:
             query (str): The query string.
             top_k (int): The number of top documents to return.
-
+            retrieval_size (int, optional): The number of documents to fetch from each model.
+            max_documents_from_same_page (int, optional): The maximum number of documents (paragraphs acutually) to return from the same page.
         Returns:
             list: A list of top k documents.
         """
         query_embeddings = {f"{semantic_model}": self.models[semantic_model].encode(query) for semantic_model in
                             definitions.models.keys()}
-        all_docs_by_model = self.elastic_model.search(query_embeddings)
+        all_docs_by_model = self.elastic_model.search(embedded_search=query_embeddings, size=retrieval_size)
         all_docs = []
         ids_for_fusion = []
         all_docs_and_scores = {}
@@ -202,20 +203,20 @@ class Engine:
         print(f"\nFusing {len(ids_for_fusion)} results\n")
         fused_ids = self.reciprocal_rank_fusion(ids_for_fusion, k=top_k)
         top_k_documents = []
-
+        count_per_id = {}
 
-        for fused_id in fused_ids:
+        for fused_id in fused_ids[:top_k]:
             for doc in all_docs:
-                if doc["_source"]["page_id"] == fused_id
+                if doc["_source"]["page_id"] == fused_id:
+                    count = count_per_id.get(fused_id, 0)
+                    if count >= max_documents_from_same_page:
+                        break;
                     top_k_documents.append(doc["_source"])
-
-                    break
-            if len(top_titles) >= top_k:
-                break
+                    count_per_id[fused_id] = count + 1
 
         return top_k_documents, all_docs_and_scores
 
-    def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False):
+    def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False, retrieval_size: int = 50, max_documents_from_same_page:int=3):
         """
         Answers a query using the top_k documents and the specified model.
 
@@ -225,23 +226,27 @@ class Engine:
             model: The model to use for answering the query.
             additional_document (dict, optional): An additional document to include in the search. Default is None.
             send_complete_pages_to_llm (bool, optional): Whether to send complete pages to the
+            retrieval_size(int, optional): The number of documents to fetch from each model. Default is 50.
+            max_documents_from_same_page(int, optional): The maximum number of documents (paragraphs acutually) to return from the same page. Default is 3.
 
         Returns:
             tuple: A tuple containing the top k documents, the answer, and the stats.
         """
         before_retrieval = time.perf_counter()
-        top_k_documents, all_docs_and_scores = self.search_documents(query, top_k)
+        top_k_documents, all_docs_and_scores = self.search_documents(query=query, top_k=top_k, retrieval_size=retrieval_size,max_documents_from_same_page=max_documents_from_same_page)
 
         if send_complete_pages_to_llm:
             top_k_documents = [self.transform_document_to_full_page(doc) for doc in top_k_documents]
 
+        top_k_documents_and_additional_document = [*top_k_documents, additional_document]
+
         if additional_document:
-
+            top_k_documents_and_additional_document.append(additional_document)
 
         retrieval_time = round(time.perf_counter() - before_retrieval, 4)
         print(f"retrieval time: {retrieval_time}")
 
-        gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query,
+        gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query, top_k_documents_and_additional_document)
         stats = {
             "retrieval_time": retrieval_time,
             "gpt_model": model,
@@ -286,42 +291,6 @@ class Engine:
         full_document = unite_docs_to_single_instance(parts_of_documents)
         return full_document
 
-    def transform_document_to_full_page(self, document: dict) -> dict:
-        """
-        Adds the full page content to the document by retrieving it from Elasticsearch.
-
-        Args:
-            document (dict): The document to which the full page content will be added.
-
-        Returns:
-            dict: The updated document with the full page content added.
-        """
-        if not document.get("page_id"):
-            return document
-        full_document = self.get_full_document_by_page_id(document["page_id"])
-        if full_document and full_document.get("content"):
-            document["content"] = full_document["content"]
-        return document
-
-    def get_full_document_by_page_id(self, page_id: int) -> dict | None:
-        """
-        Retrieves a unified document instance for a given page_id by searching all indices in Elasticsearch.
-
-        Args:
-            page_id (int): The page ID to search for.
-
-        Returns:
-            dict | None: A single dict representing the united document (with metadata and concatenated content),
-            or None if no documents are found.
-        """
-        es_client = self.elastic_model.es_client
-        indices = es_client.indices.get_alias(index="*").keys()
-        parts_of_documents = find_page_id_in_all_indices(page_id=page_id, es_client=es_client, indices=indices)
-        if not parts_of_documents:
-            return None
-        full_document = unite_docs_to_single_instance(parts_of_documents)
-        return full_document
-
 
 engine = None
 
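For context, the behavioral core of this engine.py change is the new selection loop in search_documents: retrieval_size now controls how many candidates are fetched from Elasticsearch per model, and max_documents_from_same_page caps how many paragraphs from a single page can reach the LLM. Below is a minimal, self-contained sketch of that capping pattern; the function name and the sample documents are illustrative and not taken from the package.

# Illustrative sketch of the per-page cap applied after reciprocal rank fusion.
# `fused_ids` stands in for the ranking returned by reciprocal_rank_fusion and
# `all_docs` for the raw Elasticsearch hits; both are made-up sample data.
def select_top_documents(fused_ids, all_docs, top_k, max_documents_from_same_page):
    top_documents = []
    count_per_id = {}
    for fused_id in fused_ids[:top_k]:
        for doc in all_docs:
            if doc["_source"]["page_id"] == fused_id:
                count = count_per_id.get(fused_id, 0)
                if count >= max_documents_from_same_page:
                    break  # enough paragraphs from this page already
                top_documents.append(doc["_source"])
                count_per_id[fused_id] = count + 1
    return top_documents

if __name__ == "__main__":
    sample_docs = [{"_source": {"page_id": 1, "content": f"paragraph {i}"}} for i in range(5)]
    sample_docs.append({"_source": {"page_id": 2, "content": "paragraph from another page"}})
    # With a cap of 3, page 1 contributes only its first three paragraphs.
    print(select_top_documents([1, 2], sample_docs, top_k=2, max_documents_from_same_page=3))
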
kolzchut_ragbot/llm_client.py
CHANGED
@@ -7,5 +7,5 @@ class LLMClient(ABC):
     def __init__(self):
         self.field_for_answer = definitions.field_for_llm
     @abstractmethod
-    def answer(self, _question, _top_k_docs) -> tuple[str, float, int]:
+    def answer(self, _question, _top_k_docs) -> tuple[str, float, int, dict]:
         raise NotImplementedError
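The abstract answer method now declares a four-element return; the call site in engine.py unpacks it as gpt_answer, gpt_elapsed, tokens, request_params. Below is a self-contained sketch of a subclass conforming to the new signature; the class name, the locally fabricated answer, and the placeholder request parameters are invented for illustration, and the base class is restated only so the snippet runs on its own.

import time
from abc import ABC, abstractmethod

class LLMClient(ABC):
    # Minimal stand-in mirroring the abstract interface from kolzchut_ragbot/llm_client.py.
    @abstractmethod
    def answer(self, _question, _top_k_docs) -> tuple[str, float, int, dict]:
        raise NotImplementedError

class EchoLLMClient(LLMClient):
    # Illustrative implementation that fabricates an answer locally instead of calling an LLM.
    def answer(self, question, top_k_docs) -> tuple[str, float, int, dict]:
        start = time.perf_counter()
        answer_text = f"{len(top_k_docs)} documents retrieved for: {question}"
        elapsed = round(time.perf_counter() - start, 4)
        tokens = len(answer_text.split())
        request_params = {"temperature": 0.0}  # placeholder request parameters
        return answer_text, elapsed, tokens, request_params
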
{kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/RECORD
CHANGED
@@ -2,11 +2,11 @@ kolzchut_ragbot/Document.py,sha256=5OyBBTZyAJFM_1Pjs3SUC-_s5zEJ5U6wjhw12_FFkdE,3
 kolzchut_ragbot/IntegrateService.py,sha256=rcwUY2RkclCY3l8BGAmNbstdxhxwhLO9oA8BofqLyts,96
 kolzchut_ragbot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kolzchut_ragbot/config.py,sha256=uILFvgn9W92-NRaKXYtaoQXpn3KOWKK8SZYRsIAa5Yw,133
-kolzchut_ragbot/engine.py,sha256=
+kolzchut_ragbot/engine.py,sha256=_CMuQxJ1rmJt32kY0Wpp2acXPDDkP6A3tLSzWtSSaEQ,13238
 kolzchut_ragbot/get_full_documents_utilities.py,sha256=YWljmGWM6h1ghLDCAUnDdhmn-0k6R_t7b1g7wSojzvg,1882
-kolzchut_ragbot/llm_client.py,sha256=
+kolzchut_ragbot/llm_client.py,sha256=JdDeOn2THpkOM2Mwe2DucTaYXul1fL2agIisBuHFtc8,347
 kolzchut_ragbot/model.py,sha256=HCi3r4YztPknnbgTOA7I-GVaqxn8CzrTeLFkEg-7fg0,6320
-kolzchut_ragbot-1.
-kolzchut_ragbot-1.
-kolzchut_ragbot-1.
-kolzchut_ragbot-1.
+kolzchut_ragbot-1.7.1.dist-info/METADATA,sha256=oKOVoVVM3_JzZWdcR5hBKgslY25TfLEIaNupCVQxTaM,1999
+kolzchut_ragbot-1.7.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+kolzchut_ragbot-1.7.1.dist-info/top_level.txt,sha256=NTZoY4GGw3v_7jm0MgcdHw8simoZ78PsR7Meqmkgd_Q,16
+kolzchut_ragbot-1.7.1.dist-info/RECORD,,
{kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/WHEEL
File without changes
{kolzchut_ragbot-1.6.0.dist-info → kolzchut_ragbot-1.7.1.dist-info}/top_level.txt
File without changes