kolzchut-ragbot 1.5.0__tar.gz → 1.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/PKG-INFO +1 -1
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot/engine.py +47 -4
- kolzchut_ragbot-1.6.1/kolzchut_ragbot/get_full_documents_utilities.py +45 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot/llm_client.py +1 -1
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot.egg-info/PKG-INFO +1 -1
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot.egg-info/SOURCES.txt +1 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/pyproject.toml +1 -1
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/setup.py +1 -1
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/test/test_engine.py +2 -3
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/README.md +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot/Document.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot/IntegrateService.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot/__init__.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot/config.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot/model.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot.egg-info/dependency_links.txt +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot.egg-info/requires.txt +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot.egg-info/top_level.txt +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/setup.cfg +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/test/test_configs.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/test/test_docs.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/test/test_document.py +0 -0
- {kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/test/test_model.py +0 -0
@@ -7,7 +7,7 @@ from .model import es_client_factory
|
|
7
7
|
from .Document import factory
|
8
8
|
from sentence_transformers import SentenceTransformer
|
9
9
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
10
|
-
|
10
|
+
from .get_full_documents_utilities import find_page_id_in_all_indices, unite_docs_to_single_instance
|
11
11
|
import torch
|
12
12
|
import os
|
13
13
|
|
@@ -215,7 +215,7 @@ class Engine:
|
|
215
215
|
|
216
216
|
return top_k_documents, all_docs_and_scores
|
217
217
|
|
218
|
-
def answer_query(self, query, top_k: int, model):
|
218
|
+
def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False):
|
219
219
|
"""
|
220
220
|
Answers a query using the top_k documents and the specified model.
|
221
221
|
|
@@ -223,6 +223,8 @@ class Engine:
|
|
223
223
|
query (str): The query string.
|
224
224
|
top_k (int): The number of top documents to use for answering the query.
|
225
225
|
model: The model to use for answering the query.
|
226
|
+
additional_document (dict, optional): An additional document appended to the retrieved documents before they are sent to the LLM. Default is None.
|
227
|
+
send_complete_pages_to_llm (bool, optional): Whether to replace each retrieved document's content with its full page content before sending it to the LLM. Default is False.
|
226
228
|
|
227
229
|
Returns:
|
228
230
|
tuple: A tuple containing the top k documents, the answer, and the stats.
|
@@ -230,18 +232,59 @@ class Engine:
|
|
230
232
|
before_retrieval = time.perf_counter()
|
231
233
|
top_k_documents, all_docs_and_scores = self.search_documents(query, top_k)
|
232
234
|
|
235
|
+
if send_complete_pages_to_llm:
|
236
|
+
top_k_documents = [self.transform_document_to_full_page(doc) for doc in top_k_documents]
|
237
|
+
|
238
|
+
if additional_document:
|
239
|
+
top_k_documents.append(additional_document)
|
233
240
|
|
234
241
|
retrieval_time = round(time.perf_counter() - before_retrieval, 4)
|
235
242
|
print(f"retrieval time: {retrieval_time}")
|
236
243
|
|
237
|
-
gpt_answer, gpt_elapsed, tokens = self.llms_client.answer(query, top_k_documents)
|
244
|
+
gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query, top_k_documents)
|
238
245
|
stats = {
|
239
246
|
"retrieval_time": retrieval_time,
|
240
247
|
"gpt_model": model,
|
241
248
|
"gpt_time": gpt_elapsed,
|
242
249
|
"tokens": tokens
|
243
250
|
}
|
244
|
-
return top_k_documents, gpt_answer, stats, all_docs_and_scores
|
251
|
+
return top_k_documents, gpt_answer, stats, all_docs_and_scores, request_params
|
252
|
+
|
253
|
+
def transform_document_to_full_page(self, document: dict) -> dict:
    """
    Swap a document's content for the content of its whole page, when available.

    The document is modified in place and the same object is returned.

    Args:
        document (dict): A document dict; its "page_id" entry (if any) is used for the lookup.

    Returns:
        dict: The same document. Its "content" is overwritten only when a full
        document with non-empty "content" was found; otherwise it is returned untouched.
    """
    page_id = document.get("page_id")
    if page_id:
        # Only look the page up when there is a usable page_id.
        page = self.get_full_document_by_page_id(page_id)
        if page and page.get("content"):
            document["content"] = page["content"]
    return document
|
269
|
+
|
270
|
+
def get_full_document_by_page_id(self, page_id: int) -> dict | None:
    """
    Build one united document for a page_id from the parts stored in Elasticsearch.

    The index names are taken from the keys of the client's alias listing for "*",
    and the lookup and merging are delegated to find_page_id_in_all_indices and
    unite_docs_to_single_instance.

    Args:
        page_id (int): The page ID to look up.

    Returns:
        dict | None: The united document, or None when no parts were found.
    """
    client = self.elastic_model.es_client
    index_names = client.indices.get_alias(index="*").keys()
    matching_parts = find_page_id_in_all_indices(page_id=page_id, es_client=client, indices=index_names)
    return unite_docs_to_single_instance(matching_parts) if matching_parts else None
|
245
288
|
|
246
289
|
|
247
290
|
engine = None
|
@@ -0,0 +1,45 @@
|
|
1
|
+
from elasticsearch import Elasticsearch
|
2
|
+
from typing import List, Dict, Any
|
3
|
+
|
4
|
+
|
5
|
+
def find_page_id_in_all_indices(page_id: int, es_client: "Elasticsearch", indices: List[str]) -> List[Dict[str, Any]]:
    """
    Search the given indices for documents with the given page_id.

    Each index name has its first underscore-delimited segment removed before
    it is used as the search target (presumably to map a prefixed physical index
    name to its alias; confirm against the index naming scheme). Targets are
    de-duplicated, so several names that reduce to the same target are searched
    only once, and names that reduce to an empty string are skipped.

    Args:
        page_id (int): The page ID to match with a term query on "page_id".
        es_client (Elasticsearch): Client whose search() method is called.
        indices (List[str]): Index names to derive the search targets from.

    Returns:
        List[Dict[str, Any]]: The "_source" of every hit, in search order.
        At most 100 hits are requested per target.
    """
    # dict.fromkeys drops repeated targets while keeping first-seen order, so the
    # same documents are not fetched (and later concatenated) more than once.
    search_targets = dict.fromkeys("_".join(name.split("_")[1:]) for name in indices)
    all_docs = []
    for target in search_targets:
        if not target:
            # A name without "_" reduces to "", which is not a usable index name.
            continue
        resp = es_client.search(index=target, body={
            "query": {
                "term": {"page_id": page_id}
            }
        }, size=100)
        hits = resp.get('hits', {}).get('hits', [])
        all_docs.extend(hit['_source'] for hit in hits)
    return all_docs
|
21
|
+
|
22
|
+
|
23
|
+
def unite_docs_to_single_instance(docs: List[Dict[str, Any]]) -> Dict[str, Any] | None:
|
24
|
+
"""
|
25
|
+
Unites a list of Elasticsearch document dicts (with the same page_id) into a single instance dict.
|
26
|
+
- Takes metadata fields (page_id, title, url, articleType, articleContentArea, etc.) from the first document.
|
27
|
+
- Concatenates all 'content' fields with a line break between them.
|
28
|
+
- Returns a single dict representing the united document, or None if docs is empty.
|
29
|
+
"""
|
30
|
+
if not docs:
|
31
|
+
return None
|
32
|
+
first = docs[0]
|
33
|
+
united_content = '\n'.join([doc.get('content', '') for doc in docs if doc.get('content')])
|
34
|
+
instance = {
|
35
|
+
"page_id": first.get("page_id"),
|
36
|
+
"title": first.get("title"),
|
37
|
+
"url": first.get("url"),
|
38
|
+
"link": first.get("url").split("/")[-1] if first.get("url") else None,
|
39
|
+
"articleType": first.get("articleType"),
|
40
|
+
"articleContentArea": first.get("articleContentArea"),
|
41
|
+
"summary": first.get("summary"),
|
42
|
+
"categories": first.get("categories"),
|
43
|
+
"content": united_content
|
44
|
+
}
|
45
|
+
return instance
|
@@ -7,5 +7,5 @@ class LLMClient(ABC):
|
|
7
7
|
def __init__(self):
|
8
8
|
self.field_for_answer = definitions.field_for_llm
|
9
9
|
@abstractmethod
|
10
|
-
def answer(self, _question, _top_k_docs) -> tuple[str, float, int]:
|
10
|
+
def answer(self, _question, _top_k_docs) -> tuple[str, float, int, dict]:
|
11
11
|
raise NotImplementedError
|
@@ -6,6 +6,7 @@ kolzchut_ragbot/IntegrateService.py
|
|
6
6
|
kolzchut_ragbot/__init__.py
|
7
7
|
kolzchut_ragbot/config.py
|
8
8
|
kolzchut_ragbot/engine.py
|
9
|
+
kolzchut_ragbot/get_full_documents_utilities.py
|
9
10
|
kolzchut_ragbot/llm_client.py
|
10
11
|
kolzchut_ragbot/model.py
|
11
12
|
kolzchut_ragbot.egg-info/PKG-INFO
|
@@ -7,7 +7,7 @@ from setuptools import setup, find_packages
|
|
7
7
|
|
8
8
|
setup(
|
9
9
|
name='kolzchut-ragbot',
|
10
|
-
version='1.5.0',
|
10
|
+
version='1.6.1',
|
11
11
|
author='Shmuel Robinov',
|
12
12
|
author_email='shmuel_robinov@webiks.com',
|
13
13
|
description='A search engine using machine learning models and Elasticsearch for advanced document retrieval.',
|
@@ -135,8 +135,8 @@ class TestEngine:
|
|
135
135
|
{'page_id': 5, 'title': 'title5'}
|
136
136
|
], {})
|
137
137
|
|
138
|
-
llm_client.answer.return_value = ('answer', 0.5, 100)
|
139
|
-
actual_top_k_documents, actual_gpt_answer, actual_stats,
|
138
|
+
llm_client.answer.return_value = ('answer', 0.5, 100, {})
|
139
|
+
actual_top_k_documents, actual_gpt_answer, actual_stats, all_docs_and_score, request_params = engine.answer_query("test query", 5, 'gpt-4o')
|
140
140
|
|
141
141
|
expected_top_k_documents = [
|
142
142
|
{'page_id': 3, 'title': 'title3'},
|
@@ -156,4 +156,3 @@ class TestEngine:
|
|
156
156
|
assert expected_top_k_documents == actual_top_k_documents
|
157
157
|
assert expected_gpt_answer == actual_gpt_answer
|
158
158
|
assert expected_stats == actual_stats
|
159
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{kolzchut_ragbot-1.5.0 → kolzchut_ragbot-1.6.1}/kolzchut_ragbot.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|