kolzchut-ragbot 1.6.1__tar.gz → 1.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/PKG-INFO +1 -1
  2. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/engine.py +19 -14
  3. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot.egg-info/PKG-INFO +1 -1
  4. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/pyproject.toml +1 -1
  5. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/setup.py +1 -1
  6. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/test/test_engine.py +4 -2
  7. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/README.md +0 -0
  8. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/Document.py +0 -0
  9. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/IntegrateService.py +0 -0
  10. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/__init__.py +0 -0
  11. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/config.py +0 -0
  12. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/get_full_documents_utilities.py +0 -0
  13. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/llm_client.py +0 -0
  14. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot/model.py +0 -0
  15. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot.egg-info/SOURCES.txt +0 -0
  16. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot.egg-info/dependency_links.txt +0 -0
  17. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot.egg-info/requires.txt +0 -0
  18. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/kolzchut_ragbot.egg-info/top_level.txt +0 -0
  19. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/setup.cfg +0 -0
  20. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/test/test_configs.py +0 -0
  21. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/test/test_docs.py +0 -0
  22. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/test/test_document.py +0 -0
  23. {kolzchut_ragbot-1.6.1 → kolzchut_ragbot-1.7.2}/test/test_model.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kolzchut-ragbot
3
- Version: 1.6.1
3
+ Version: 1.7.2
4
4
  Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
5
5
  Home-page: https://github.com/shmuelrob/rag-bot
6
6
  Author: Shmuel Robinov
@@ -170,20 +170,21 @@ class Engine:
170
170
 
171
171
  return fused_list
172
172
 
173
- def search_documents(self, query: str, top_k: int):
173
+ def search_documents(self, query: str, top_k: int,retrieval_size: int, max_documents_from_same_page:int):
174
174
  """
175
175
  Searches for documents based on the query and returns the top_k results.
176
176
 
177
177
  Args:
178
178
  query (str): The query string.
179
179
  top_k (int): The number of top documents to return.
180
-
180
+ retrieval_size (int): The number of documents to fetch from each model.
181
+ max_documents_from_same_page (int): The maximum number of documents (actually paragraphs) to return from the same page.
181
182
  Returns:
182
183
  list: A list of top k documents.
183
184
  """
184
185
  query_embeddings = {f"{semantic_model}": self.models[semantic_model].encode(query) for semantic_model in
185
186
  definitions.models.keys()}
186
- all_docs_by_model = self.elastic_model.search(query_embeddings)
187
+ all_docs_by_model = self.elastic_model.search(embedded_search=query_embeddings, size=retrieval_size)
187
188
  all_docs = []
188
189
  ids_for_fusion = []
189
190
  all_docs_and_scores = {}
@@ -202,20 +203,20 @@ class Engine:
202
203
  print(f"\nFusing {len(ids_for_fusion)} results\n")
203
204
  fused_ids = self.reciprocal_rank_fusion(ids_for_fusion, k=top_k)
204
205
  top_k_documents = []
205
- top_titles = []
206
+ count_per_id = {}
206
207
 
207
- for fused_id in fused_ids:
208
+ for fused_id in fused_ids[:top_k]:
208
209
  for doc in all_docs:
209
- if doc["_source"]["page_id"] == fused_id and doc["_source"]["title"] not in top_titles:
210
+ if doc["_source"]["page_id"] == fused_id:
211
+ count = count_per_id.get(fused_id, 0)
212
+ if count >= max_documents_from_same_page:
213
+ break;
210
214
  top_k_documents.append(doc["_source"])
211
- top_titles.append(doc["_source"]["title"])
212
- break
213
- if len(top_titles) >= top_k:
214
- break
215
+ count_per_id[fused_id] = count + 1
215
216
 
216
217
  return top_k_documents, all_docs_and_scores
217
218
 
218
- def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False):
219
+ def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False, retrieval_size: int = 50, max_documents_from_same_page:int=3):
219
220
  """
220
221
  Answers a query using the top_k documents and the specified model.
221
222
 
@@ -225,23 +226,27 @@ class Engine:
225
226
  model: The model to use for answering the query.
226
227
  additional_document (dict, optional): An additional document to include in the search. Default is None.
227
228
  send_complete_pages_to_llm (bool, optional): Whether to send complete pages to the LLM. Default is False.
229
+ retrieval_size (int, optional): The number of documents to fetch from each model. Default is 50.
230
+ max_documents_from_same_page (int, optional): The maximum number of documents (actually paragraphs) to return from the same page. Default is 3.
228
231
 
229
232
  Returns:
230
233
  tuple: A tuple containing the top k documents, the answer, and the stats.
231
234
  """
232
235
  before_retrieval = time.perf_counter()
233
- top_k_documents, all_docs_and_scores = self.search_documents(query, top_k)
236
+ top_k_documents, all_docs_and_scores = self.search_documents(query=query, top_k=top_k, retrieval_size=retrieval_size,max_documents_from_same_page=max_documents_from_same_page)
234
237
 
235
238
  if send_complete_pages_to_llm:
236
239
  top_k_documents = [self.transform_document_to_full_page(doc) for doc in top_k_documents]
237
240
 
241
+ top_k_documents_and_additional_document = top_k_documents.copy()
242
+
238
243
  if additional_document:
239
- top_k_documents.append(additional_document)
244
+ top_k_documents_and_additional_document.append(additional_document)
240
245
 
241
246
  retrieval_time = round(time.perf_counter() - before_retrieval, 4)
242
247
  print(f"retrieval time: {retrieval_time}")
243
248
 
244
- gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query, top_k_documents)
249
+ gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query, top_k_documents_and_additional_document)
245
250
  stats = {
246
251
  "retrieval_time": retrieval_time,
247
252
  "gpt_model": model,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kolzchut-ragbot
3
- Version: 1.6.1
3
+ Version: 1.7.2
4
4
  Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
5
5
  Home-page: https://github.com/shmuelrob/rag-bot
6
6
  Author: Shmuel Robinov
@@ -14,7 +14,7 @@ push = false
14
14
 
15
15
  [tool.poetry]
16
16
  name = "ragbot"
17
- version = "1.6.1"
17
+ version = "1.7.2"
18
18
  description = ""
19
19
  authors = ["Your Name <your.email@example.com>"]
20
20
 
@@ -7,7 +7,7 @@ from setuptools import setup, find_packages
7
7
 
8
8
  setup(
9
9
  name='kolzchut-ragbot',
10
- version='1.6.1',
10
+ version='1.7.2',
11
11
  author='Shmuel Robinov',
12
12
  author_email='shmuel_robinov@webiks.com',
13
13
  description='A search engine using machine learning models and Elasticsearch for advanced document retrieval.',
@@ -111,7 +111,7 @@ class TestEngine:
111
111
  mock_reciprocal_rank_fusion.return_value = [3, 2, 4, 1, 5]
112
112
  engine = build_test_engine(es_model, llm_client)
113
113
 
114
- result, all_docs_and_scores = engine.search_documents("test query", 5)
114
+ result, all_docs_and_scores = engine.search_documents("test query", 5,50, 1)
115
115
 
116
116
  # Just verify the method completes without error
117
117
  assert result is not None
@@ -136,7 +136,9 @@ class TestEngine:
136
136
  ], {})
137
137
 
138
138
  llm_client.answer.return_value = ('answer', 0.5, 100, {})
139
- actual_top_k_documents, actual_gpt_answer, actual_stats, all_docs_and_score, request_params = engine.answer_query("test query", 5, 'gpt-4o')
139
+ # Make retrieval_time deterministic (0)
140
+ with patch('time.perf_counter', side_effect=[100.0, 100.0]):
141
+ actual_top_k_documents, actual_gpt_answer, actual_stats, all_docs_and_score, request_params = engine.answer_query("test query", 5, 'gpt-4o')
140
142
 
141
143
  expected_top_k_documents = [
142
144
  {'page_id': 3, 'title': 'title3'},