kolzchut-ragbot 1.5.0__tar.gz → 1.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/PKG-INFO +1 -1
  2. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot/engine.py +83 -4
  3. kolzchut_ragbot-1.6.0/kolzchut_ragbot/get_full_documents_utilities.py +45 -0
  4. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot.egg-info/PKG-INFO +1 -1
  5. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot.egg-info/SOURCES.txt +1 -0
  6. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/pyproject.toml +1 -1
  7. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/setup.py +1 -1
  8. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/README.md +0 -0
  9. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot/Document.py +0 -0
  10. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot/IntegrateService.py +0 -0
  11. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot/__init__.py +0 -0
  12. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot/config.py +0 -0
  13. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot/llm_client.py +0 -0
  14. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot/model.py +0 -0
  15. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot.egg-info/dependency_links.txt +0 -0
  16. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot.egg-info/requires.txt +0 -0
  17. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/kolzchut_ragbot.egg-info/top_level.txt +0 -0
  18. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/setup.cfg +0 -0
  19. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/test/test_configs.py +0 -0
  20. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/test/test_docs.py +0 -0
  21. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/test/test_document.py +0 -0
  22. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/test/test_engine.py +0 -0
  23. {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.0}/test/test_model.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kolzchut-ragbot
3
- Version: 1.5.0
3
+ Version: 1.6.0
4
4
  Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
5
5
  Home-page: https://github.com/shmuelrob/rag-bot
6
6
  Author: Shmuel Robinov
@@ -7,7 +7,7 @@ from .model import es_client_factory
7
7
  from .Document import factory
8
8
  from sentence_transformers import SentenceTransformer
9
9
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
-
10
+ from .get_full_documents_utilities import find_page_id_in_all_indices, unite_docs_to_single_instance
11
11
  import torch
12
12
  import os
13
13
 
@@ -215,7 +215,7 @@ class Engine:
215
215
 
216
216
  return top_k_documents, all_docs_and_scores
217
217
 
218
- def answer_query(self, query, top_k: int, model):
218
+ def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False):
219
219
  """
220
220
  Answers a query using the top_k documents and the specified model.
221
221
 
@@ -223,6 +223,8 @@ class Engine:
223
223
  query (str): The query string.
224
224
  top_k (int): The number of top documents to use for answering the query.
225
225
  model: The model to use for answering the query.
226
+ additional_document (dict, optional): An additional document to include in the search. Default is None.
227
+ send_complete_pages_to_llm (bool, optional): Whether to send complete pages to the LLM (each retrieved document's content is replaced by its full page content). Default is False.
226
228
 
227
229
  Returns:
228
230
  tuple: A tuple containing the top k documents, the answer, and the stats.
@@ -230,18 +232,95 @@ class Engine:
230
232
  before_retrieval = time.perf_counter()
231
233
  top_k_documents, all_docs_and_scores = self.search_documents(query, top_k)
232
234
 
235
+ if send_complete_pages_to_llm:
236
+ top_k_documents = [self.transform_document_to_full_page(doc) for doc in top_k_documents]
237
+
238
+ if additional_document:
239
+ top_k_documents.append(additional_document)
233
240
 
234
241
  retrieval_time = round(time.perf_counter() - before_retrieval, 4)
235
242
  print(f"retrieval time: {retrieval_time}")
236
243
 
237
- gpt_answer, gpt_elapsed, tokens = self.llms_client.answer(query, top_k_documents)
244
+ gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query, top_k_documents)
238
245
  stats = {
239
246
  "retrieval_time": retrieval_time,
240
247
  "gpt_model": model,
241
248
  "gpt_time": gpt_elapsed,
242
249
  "tokens": tokens
243
250
  }
244
- return top_k_documents, gpt_answer, stats, all_docs_and_scores
251
+ return top_k_documents, gpt_answer, stats, all_docs_and_scores, request_params
252
+
253
+ def transform_document_to_full_page(self, document: dict) -> dict:
254
+ """
255
+ Adds the full page content to the document by retrieving it from Elasticsearch.
256
+
257
+ Args:
258
+ document (dict): The document to which the full page content will be added.
259
+
260
+ Returns:
261
+ dict: The updated document with the full page content added.
262
+ """
263
+ if not document.get("page_id"):
264
+ return document
265
+ full_document = self.get_full_document_by_page_id(document["page_id"])
266
+ if full_document and full_document.get("content"):
267
+ document["content"] = full_document["content"]
268
+ return document
269
+
270
+ def get_full_document_by_page_id(self, page_id: int) -> dict | None:
271
+ """
272
+ Retrieves a unified document instance for a given page_id by searching all indices in Elasticsearch.
273
+
274
+ Args:
275
+ page_id (int): The page ID to search for.
276
+
277
+ Returns:
278
+ dict | None: A single dict representing the united document (with metadata and concatenated content),
279
+ or None if no documents are found.
280
+ """
281
+ es_client = self.elastic_model.es_client
282
+ indices = es_client.indices.get_alias(index="*").keys()
283
+ parts_of_documents = find_page_id_in_all_indices(page_id=page_id, es_client=es_client, indices=indices)
284
+ if not parts_of_documents:
285
+ return None
286
+ full_document = unite_docs_to_single_instance(parts_of_documents)
287
+ return full_document
288
+
289
+ def transform_document_to_full_page(self, document: dict) -> dict:
290
+ """
291
+ Adds the full page content to the document by retrieving it from Elasticsearch.
292
+
293
+ Args:
294
+ document (dict): The document to which the full page content will be added.
295
+
296
+ Returns:
297
+ dict: The updated document with the full page content added.
298
+ """
299
+ if not document.get("page_id"):
300
+ return document
301
+ full_document = self.get_full_document_by_page_id(document["page_id"])
302
+ if full_document and full_document.get("content"):
303
+ document["content"] = full_document["content"]
304
+ return document
305
+
306
+ def get_full_document_by_page_id(self, page_id: int) -> dict | None:
307
+ """
308
+ Retrieves a unified document instance for a given page_id by searching all indices in Elasticsearch.
309
+
310
+ Args:
311
+ page_id (int): The page ID to search for.
312
+
313
+ Returns:
314
+ dict | None: A single dict representing the united document (with metadata and concatenated content),
315
+ or None if no documents are found.
316
+ """
317
+ es_client = self.elastic_model.es_client
318
+ indices = es_client.indices.get_alias(index="*").keys()
319
+ parts_of_documents = find_page_id_in_all_indices(page_id=page_id, es_client=es_client, indices=indices)
320
+ if not parts_of_documents:
321
+ return None
322
+ full_document = unite_docs_to_single_instance(parts_of_documents)
323
+ return full_document
245
324
 
246
325
 
247
326
  engine = None
@@ -0,0 +1,45 @@
1
+ from elasticsearch import Elasticsearch
2
+ from typing import List, Dict, Any
3
+
4
+
5
+ def find_page_id_in_all_indices(page_id: int, es_client: Elasticsearch, indices: List[str]) -> List[Dict[str, Any]]:
6
+ """
7
+ Search all provided indices using the given es_client for documents with the given page_id.
8
+ Returns a list of all matching documents' _source fields.
9
+ """
10
+ all_docs = []
11
+ fixed_indicies = ["_".join(index.split("_")[1:]) for index in indices]
12
+ for index in fixed_indicies:
13
+ resp = es_client.search(index=index, body={
14
+ "query": {
15
+ "term": {"page_id": page_id}
16
+ }
17
+ }, size=100)
18
+ hits = resp.get('hits', {}).get('hits', [])
19
+ all_docs.extend([doc['_source'] for doc in hits])
20
+ return all_docs
21
+
22
+
23
+ def unite_docs_to_single_instance(docs: List[Dict[str, Any]]) -> Dict[str, Any] | None:
24
+ """
25
+ Unites a list of Elasticsearch document dicts (with the same page_id) into a single instance dict.
26
+ - Takes metadata fields (page_id, title, url, articleType, articleContentArea, etc.) from the first document.
27
+ - Concatenates all 'content' fields with a line break between them.
28
+ - Returns a single dict representing the united document, or None if docs is empty.
29
+ """
30
+ if not docs:
31
+ return None
32
+ first = docs[0]
33
+ united_content = '\n'.join([doc.get('content', '') for doc in docs if doc.get('content')])
34
+ instance = {
35
+ "page_id": first.get("page_id"),
36
+ "title": first.get("title"),
37
+ "url": first.get("url"),
38
+ "link": first.get("url").split("/")[-1] if first.get("url") else None,
39
+ "articleType": first.get("articleType"),
40
+ "articleContentArea": first.get("articleContentArea"),
41
+ "summary": first.get("summary"),
42
+ "categories": first.get("categories"),
43
+ "content": united_content
44
+ }
45
+ return instance
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kolzchut-ragbot
3
- Version: 1.5.0
3
+ Version: 1.6.0
4
4
  Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
5
5
  Home-page: https://github.com/shmuelrob/rag-bot
6
6
  Author: Shmuel Robinov
@@ -6,6 +6,7 @@ kolzchut_ragbot/IntegrateService.py
6
6
  kolzchut_ragbot/__init__.py
7
7
  kolzchut_ragbot/config.py
8
8
  kolzchut_ragbot/engine.py
9
+ kolzchut_ragbot/get_full_documents_utilities.py
9
10
  kolzchut_ragbot/llm_client.py
10
11
  kolzchut_ragbot/model.py
11
12
  kolzchut_ragbot.egg-info/PKG-INFO
@@ -14,7 +14,7 @@ push = false
14
14
 
15
15
  [tool.poetry]
16
16
  name = "ragbot"
17
- version = "1.5.0"
17
+ version = "1.6.0"
18
18
  description = ""
19
19
  authors = ["Your Name <your.email@example.com>"]
20
20
 
@@ -7,7 +7,7 @@ from setuptools import setup, find_packages
7
7
 
8
8
  setup(
9
9
  name='kolzchut-ragbot',
10
- version='1.5.0',
10
+ version='1.6.0',
11
11
  author='Shmuel Robinov',
12
12
  author_email='shmuel_robinov@webiks.com',
13
13
  description='A search engine using machine learning models and Elasticsearch for advanced document retrieval.',