kolzchut-ragbot 1.4.2__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kolzchut_ragbot/Document.py +101 -101
- kolzchut_ragbot/IntegrateService.py +4 -4
- kolzchut_ragbot/__init__.py +0 -2
- kolzchut_ragbot/config.py +5 -5
- kolzchut_ragbot/engine.py +333 -246
- kolzchut_ragbot/get_full_documents_utilities.py +45 -0
- kolzchut_ragbot/llm_client.py +11 -11
- kolzchut_ragbot/model.py +182 -182
- kolzchut_ragbot-1.6.0.dist-info/METADATA +76 -0
- kolzchut_ragbot-1.6.0.dist-info/RECORD +12 -0
- {kolzchut_ragbot-1.4.2.dist-info → kolzchut_ragbot-1.6.0.dist-info}/WHEEL +1 -1
- kolzchut_ragbot-1.4.2.dist-info/METADATA +0 -67
- kolzchut_ragbot-1.4.2.dist-info/RECORD +0 -11
- {kolzchut_ragbot-1.4.2.dist-info → kolzchut_ragbot-1.6.0.dist-info}/top_level.txt +0 -0
kolzchut_ragbot/engine.py
CHANGED
|
@@ -1,246 +1,333 @@
|
|
|
1
|
-
import time
|
|
2
|
-
from collections import defaultdict
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from .llm_client import LLMClient
|
|
5
|
-
from . import config
|
|
6
|
-
from .model import es_client_factory
|
|
7
|
-
from .Document import factory
|
|
8
|
-
from sentence_transformers import SentenceTransformer
|
|
9
|
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
10
|
-
|
|
11
|
-
import torch
|
|
12
|
-
import os
|
|
13
|
-
|
|
14
|
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
15
|
-
definitions = factory()
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class Engine:
|
|
19
|
-
"""
|
|
20
|
-
Engine class for handling document search and retrieval using Elasticsearch and LLMs.
|
|
21
|
-
|
|
22
|
-
Attributes:
|
|
23
|
-
llms_client (LLMClient): The LLM client instance.
|
|
24
|
-
elastic_model (Model): The Elasticsearch model instance.
|
|
25
|
-
models (dict): A dictionary of SentenceTransformer models.
|
|
26
|
-
reranker_tokenizer (AutoTokenizer): The tokenizer for the reranker model.
|
|
27
|
-
reranker_model (AutoModelForSequenceClassification): The reranker model.
|
|
28
|
-
identifier_field (str): The identifier field for documents.
|
|
29
|
-
|
|
30
|
-
Methods:
|
|
31
|
-
rerank_with_me5(query, documents, k=5):
|
|
32
|
-
Reranks documents based on the query using the reranker model.
|
|
33
|
-
|
|
34
|
-
update_docs(list_of_docs, embed_only_fields=None, delete_existing=False):
|
|
35
|
-
Updates or creates documents in the Elasticsearch index.
|
|
36
|
-
|
|
37
|
-
reciprocal_rank_fusion(ranking_lists, k=60, weights=None):
|
|
38
|
-
Performs Reciprocal Rank Fusion on a list of ranking lists.
|
|
39
|
-
|
|
40
|
-
search_documents(query, top_k):
|
|
41
|
-
Searches for documents based on the query and returns the top_k results.
|
|
42
|
-
|
|
43
|
-
answer_query(query, top_k, model):
|
|
44
|
-
Answers a query using the top_k documents and the specified model.
|
|
45
|
-
"""
|
|
46
|
-
|
|
47
|
-
def __init__(self, llms_client: LLMClient, elastic_model=None, models=None, reranker_tokenizer=None,
|
|
48
|
-
reranker_model=None, es_client=None):
|
|
49
|
-
"""
|
|
50
|
-
Initializes the Engine instance.
|
|
51
|
-
|
|
52
|
-
Args:
|
|
53
|
-
llms_client (LLMClient): The LLM client instance.
|
|
54
|
-
elastic_model (Model, optional): The Elasticsearch model instance. Default is None.
|
|
55
|
-
models (dict, optional): A dictionary of SentenceTransformer models. Default is None.
|
|
56
|
-
reranker_tokenizer (AutoTokenizer, optional): The tokenizer for the reranker model. Default is None.
|
|
57
|
-
reranker_model (AutoModelForSequenceClassification, optional): The reranker model. Default is None.
|
|
58
|
-
es_client (optional): The Elasticsearch client instance. Default is None.
|
|
59
|
-
"""
|
|
60
|
-
if elastic_model is None:
|
|
61
|
-
self.elastic_model = es_client_factory(es_client)
|
|
62
|
-
else:
|
|
63
|
-
self.elastic_model = elastic_model
|
|
64
|
-
|
|
65
|
-
self.llms_client = llms_client
|
|
66
|
-
|
|
67
|
-
self.identifier_field = factory().identifier
|
|
68
|
-
|
|
69
|
-
if models is None:
|
|
70
|
-
self.models = {f"{model_name}": SentenceTransformer(config.MODELS_LOCATION + "/" + model_name).to(device)
|
|
71
|
-
for model_name in definitions.models.keys()}
|
|
72
|
-
else:
|
|
73
|
-
self.models = models
|
|
74
|
-
for model in self.models.values():
|
|
75
|
-
model.eval()
|
|
76
|
-
|
|
77
|
-
if reranker_tokenizer is None:
|
|
78
|
-
self.reranker_tokenizer = AutoTokenizer.from_pretrained(os.getenv("TOKENIZER_LOCATION"))
|
|
79
|
-
else:
|
|
80
|
-
self.reranker_tokenizer = reranker_tokenizer
|
|
81
|
-
|
|
82
|
-
if reranker_model is None:
|
|
83
|
-
self.reranker_model = AutoModelForSequenceClassification.from_pretrained(os.getenv("TOKENIZER_LOCATION"))
|
|
84
|
-
else:
|
|
85
|
-
self.reranker_model = reranker_model
|
|
86
|
-
self.reranker_model.eval()
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
for
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
1
|
+
import time
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from .llm_client import LLMClient
|
|
5
|
+
from . import config
|
|
6
|
+
from .model import es_client_factory
|
|
7
|
+
from .Document import factory
|
|
8
|
+
from sentence_transformers import SentenceTransformer
|
|
9
|
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
10
|
+
from .get_full_documents_utilities import find_page_id_in_all_indices, unite_docs_to_single_instance
|
|
11
|
+
import torch
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
15
|
+
definitions = factory()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Engine:
|
|
19
|
+
"""
|
|
20
|
+
Engine class for handling document search and retrieval using Elasticsearch and LLMs.
|
|
21
|
+
|
|
22
|
+
Attributes:
|
|
23
|
+
llms_client (LLMClient): The LLM client instance.
|
|
24
|
+
elastic_model (Model): The Elasticsearch model instance.
|
|
25
|
+
models (dict): A dictionary of SentenceTransformer models.
|
|
26
|
+
reranker_tokenizer (AutoTokenizer): The tokenizer for the reranker model.
|
|
27
|
+
reranker_model (AutoModelForSequenceClassification): The reranker model.
|
|
28
|
+
identifier_field (str): The identifier field for documents.
|
|
29
|
+
|
|
30
|
+
Methods:
|
|
31
|
+
rerank_with_me5(query, documents, k=5):
|
|
32
|
+
Reranks documents based on the query using the reranker model.
|
|
33
|
+
|
|
34
|
+
update_docs(list_of_docs, embed_only_fields=None, delete_existing=False):
|
|
35
|
+
Updates or creates documents in the Elasticsearch index.
|
|
36
|
+
|
|
37
|
+
reciprocal_rank_fusion(ranking_lists, k=60, weights=None):
|
|
38
|
+
Performs Reciprocal Rank Fusion on a list of ranking lists.
|
|
39
|
+
|
|
40
|
+
search_documents(query, top_k):
|
|
41
|
+
Searches for documents based on the query and returns the top_k results.
|
|
42
|
+
|
|
43
|
+
answer_query(query, top_k, model):
|
|
44
|
+
Answers a query using the top_k documents and the specified model.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def __init__(self, llms_client: LLMClient, elastic_model=None, models=None, reranker_tokenizer=None,
|
|
48
|
+
reranker_model=None, es_client=None):
|
|
49
|
+
"""
|
|
50
|
+
Initializes the Engine instance.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
llms_client (LLMClient): The LLM client instance.
|
|
54
|
+
elastic_model (Model, optional): The Elasticsearch model instance. Default is None.
|
|
55
|
+
models (dict, optional): A dictionary of SentenceTransformer models. Default is None.
|
|
56
|
+
reranker_tokenizer (AutoTokenizer, optional): The tokenizer for the reranker model. Default is None.
|
|
57
|
+
reranker_model (AutoModelForSequenceClassification, optional): The reranker model. Default is None.
|
|
58
|
+
es_client (optional): The Elasticsearch client instance. Default is None.
|
|
59
|
+
"""
|
|
60
|
+
if elastic_model is None:
|
|
61
|
+
self.elastic_model = es_client_factory(es_client)
|
|
62
|
+
else:
|
|
63
|
+
self.elastic_model = elastic_model
|
|
64
|
+
|
|
65
|
+
self.llms_client = llms_client
|
|
66
|
+
|
|
67
|
+
self.identifier_field = factory().identifier
|
|
68
|
+
|
|
69
|
+
if models is None:
|
|
70
|
+
self.models = {f"{model_name}": SentenceTransformer(config.MODELS_LOCATION + "/" + model_name).to(device)
|
|
71
|
+
for model_name in definitions.models.keys()}
|
|
72
|
+
else:
|
|
73
|
+
self.models = models
|
|
74
|
+
for model in self.models.values():
|
|
75
|
+
model.eval()
|
|
76
|
+
|
|
77
|
+
if reranker_tokenizer is None:
|
|
78
|
+
self.reranker_tokenizer = AutoTokenizer.from_pretrained(os.getenv("TOKENIZER_LOCATION"))
|
|
79
|
+
else:
|
|
80
|
+
self.reranker_tokenizer = reranker_tokenizer
|
|
81
|
+
|
|
82
|
+
if reranker_model is None:
|
|
83
|
+
self.reranker_model = AutoModelForSequenceClassification.from_pretrained(os.getenv("TOKENIZER_LOCATION"))
|
|
84
|
+
else:
|
|
85
|
+
self.reranker_model = reranker_model
|
|
86
|
+
self.reranker_model.eval()
|
|
87
|
+
|
|
88
|
+
def change_llm(self, llms_client: LLMClient):
|
|
89
|
+
"""
|
|
90
|
+
Changes the LLM client for the Engine instance.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
llms_client (LLMClient): The new LLM client instance.
|
|
94
|
+
"""
|
|
95
|
+
self.llms_client = llms_client
|
|
96
|
+
|
|
97
|
+
def rerank_with_me5(self, query, documents, k=5):
|
|
98
|
+
"""
|
|
99
|
+
Reranks documents based on the query using the reranker model.
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
query (str): The query string.
|
|
103
|
+
documents (list): A list of documents to be reranked.
|
|
104
|
+
k (int, optional): The number of top documents to return. Default is 5.
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
list: A list of top k reranked documents.
|
|
108
|
+
"""
|
|
109
|
+
pairs = [(query, doc) for doc in set(documents)]
|
|
110
|
+
inputs = self.reranker_tokenizer(pairs, return_tensors='pt', padding=True, truncation=True, max_length=512)
|
|
111
|
+
|
|
112
|
+
# Make predictions
|
|
113
|
+
with torch.no_grad():
|
|
114
|
+
outputs = self.reranker_model(**inputs)
|
|
115
|
+
|
|
116
|
+
scores = outputs.logits.squeeze()
|
|
117
|
+
|
|
118
|
+
if scores.ndim > 1:
|
|
119
|
+
scores = scores[:, 1] # Assuming binary classification and index 1 is the relevance score
|
|
120
|
+
|
|
121
|
+
sorted_indices = torch.argsort(scores, descending=True)
|
|
122
|
+
# If there is only one document, return it to avoid torch error
|
|
123
|
+
if len(sorted_indices) == 1:
|
|
124
|
+
return [pairs[0][1]]
|
|
125
|
+
# Sort documents by their highest score
|
|
126
|
+
sorted_docs = [pairs[i][1] for i in sorted_indices]
|
|
127
|
+
return sorted_docs[:k]
|
|
128
|
+
|
|
129
|
+
def update_docs(self, list_of_docs: list[dict], embed_only_fields=None, delete_existing=False):
|
|
130
|
+
"""
|
|
131
|
+
Updates or creates documents in the Elasticsearch index.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
list_of_docs (list[dict]): A list of dictionaries representing the documents to be indexed.
|
|
135
|
+
embed_only_fields (list, optional): A list of fields to be embedded. Default is None.
|
|
136
|
+
delete_existing (bool, optional): Whether to delete existing documents. Default is False.
|
|
137
|
+
"""
|
|
138
|
+
embed_only_fields = embed_only_fields or definitions.models.values()
|
|
139
|
+
for doc in list_of_docs:
|
|
140
|
+
for semantic_model, field in definitions.models.items():
|
|
141
|
+
if field in doc.keys() and field in embed_only_fields:
|
|
142
|
+
content_vectors = self.models[semantic_model].encode(doc[field])
|
|
143
|
+
doc[f'{field}_{semantic_model}_vectors'] = content_vectors
|
|
144
|
+
|
|
145
|
+
doc['last_update'] = datetime.now()
|
|
146
|
+
self.elastic_model.create_or_update_documents(list_of_docs, delete_existing)
|
|
147
|
+
|
|
148
|
+
def reciprocal_rank_fusion(self, ranking_lists, k=60, weights=None):
|
|
149
|
+
"""
|
|
150
|
+
Performs Reciprocal Rank Fusion on a list of ranking lists.
|
|
151
|
+
|
|
152
|
+
Args:
|
|
153
|
+
:param ranking_lists: List of ranking lists, where each ranking list is a list of documents returned by a model.
|
|
154
|
+
:param k: The parameter for the reciprocal rank calculation (default is 60).
|
|
155
|
+
:param: weights: Optional. Weights for each ranking list.
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
list: A fused ranking list of documents.
|
|
159
|
+
"""
|
|
160
|
+
scores = defaultdict(float)
|
|
161
|
+
|
|
162
|
+
for list_index, rank_list in enumerate(ranking_lists):
|
|
163
|
+
for rank, identifier in enumerate(rank_list):
|
|
164
|
+
# Reciprocal rank score
|
|
165
|
+
w = weights[list_index] if weights else 1
|
|
166
|
+
scores[identifier] += w / (k + rank + 1)
|
|
167
|
+
|
|
168
|
+
# Sort the documents by their cumulative scores in descending order
|
|
169
|
+
fused_list = sorted(scores, key=scores.get, reverse=True)
|
|
170
|
+
|
|
171
|
+
return fused_list
|
|
172
|
+
|
|
173
|
+
def search_documents(self, query: str, top_k: int):
|
|
174
|
+
"""
|
|
175
|
+
Searches for documents based on the query and returns the top_k results.
|
|
176
|
+
|
|
177
|
+
Args:
|
|
178
|
+
query (str): The query string.
|
|
179
|
+
top_k (int): The number of top documents to return.
|
|
180
|
+
|
|
181
|
+
Returns:
|
|
182
|
+
list: A list of top k documents.
|
|
183
|
+
"""
|
|
184
|
+
query_embeddings = {f"{semantic_model}": self.models[semantic_model].encode(query) for semantic_model in
|
|
185
|
+
definitions.models.keys()}
|
|
186
|
+
all_docs_by_model = self.elastic_model.search(query_embeddings)
|
|
187
|
+
all_docs = []
|
|
188
|
+
ids_for_fusion = []
|
|
189
|
+
all_docs_and_scores = {}
|
|
190
|
+
|
|
191
|
+
for key, values in all_docs_by_model.items():
|
|
192
|
+
print(f"\nFound {len(values)} documents for model\n")
|
|
193
|
+
model_ids = []
|
|
194
|
+
scores_for_model = []
|
|
195
|
+
|
|
196
|
+
for doc in values:
|
|
197
|
+
model_ids.append(doc["_source"]["page_id"])
|
|
198
|
+
all_docs.append(doc)
|
|
199
|
+
scores_for_model.append({"doc": doc["_source"]["title"], "score": doc["_score"]})
|
|
200
|
+
ids_for_fusion.append(model_ids)
|
|
201
|
+
all_docs_and_scores[f'{key}'] = scores_for_model
|
|
202
|
+
print(f"\nFusing {len(ids_for_fusion)} results\n")
|
|
203
|
+
fused_ids = self.reciprocal_rank_fusion(ids_for_fusion, k=top_k)
|
|
204
|
+
top_k_documents = []
|
|
205
|
+
top_titles = []
|
|
206
|
+
|
|
207
|
+
for fused_id in fused_ids:
|
|
208
|
+
for doc in all_docs:
|
|
209
|
+
if doc["_source"]["page_id"] == fused_id and doc["_source"]["title"] not in top_titles:
|
|
210
|
+
top_k_documents.append(doc["_source"])
|
|
211
|
+
top_titles.append(doc["_source"]["title"])
|
|
212
|
+
break
|
|
213
|
+
if len(top_titles) >= top_k:
|
|
214
|
+
break
|
|
215
|
+
|
|
216
|
+
return top_k_documents, all_docs_and_scores
|
|
217
|
+
|
|
218
|
+
def answer_query(self, query, top_k: int, model, additional_document: dict = None, send_complete_pages_to_llm: bool = False):
|
|
219
|
+
"""
|
|
220
|
+
Answers a query using the top_k documents and the specified model.
|
|
221
|
+
|
|
222
|
+
Args:
|
|
223
|
+
query (str): The query string.
|
|
224
|
+
top_k (int): The number of top documents to use for answering the query.
|
|
225
|
+
model: The model to use for answering the query.
|
|
226
|
+
additional_document (dict, optional): An additional document to include in the search. Default is None.
|
|
227
|
+
send_complete_pages_to_llm (bool, optional): Whether to send complete pages to the
|
|
228
|
+
|
|
229
|
+
Returns:
|
|
230
|
+
tuple: A tuple containing the top k documents, the answer, and the stats.
|
|
231
|
+
"""
|
|
232
|
+
before_retrieval = time.perf_counter()
|
|
233
|
+
top_k_documents, all_docs_and_scores = self.search_documents(query, top_k)
|
|
234
|
+
|
|
235
|
+
if send_complete_pages_to_llm:
|
|
236
|
+
top_k_documents = [self.transform_document_to_full_page(doc) for doc in top_k_documents]
|
|
237
|
+
|
|
238
|
+
if additional_document:
|
|
239
|
+
top_k_documents.append(additional_document)
|
|
240
|
+
|
|
241
|
+
retrieval_time = round(time.perf_counter() - before_retrieval, 4)
|
|
242
|
+
print(f"retrieval time: {retrieval_time}")
|
|
243
|
+
|
|
244
|
+
gpt_answer, gpt_elapsed, tokens, request_params = self.llms_client.answer(query, top_k_documents)
|
|
245
|
+
stats = {
|
|
246
|
+
"retrieval_time": retrieval_time,
|
|
247
|
+
"gpt_model": model,
|
|
248
|
+
"gpt_time": gpt_elapsed,
|
|
249
|
+
"tokens": tokens
|
|
250
|
+
}
|
|
251
|
+
return top_k_documents, gpt_answer, stats, all_docs_and_scores, request_params
|
|
252
|
+
|
|
253
|
+
def transform_document_to_full_page(self, document: dict) -> dict:
|
|
254
|
+
"""
|
|
255
|
+
Adds the full page content to the document by retrieving it from Elasticsearch.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
document (dict): The document to which the full page content will be added.
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
dict: The updated document with the full page content added.
|
|
262
|
+
"""
|
|
263
|
+
if not document.get("page_id"):
|
|
264
|
+
return document
|
|
265
|
+
full_document = self.get_full_document_by_page_id(document["page_id"])
|
|
266
|
+
if full_document and full_document.get("content"):
|
|
267
|
+
document["content"] = full_document["content"]
|
|
268
|
+
return document
|
|
269
|
+
|
|
270
|
+
def get_full_document_by_page_id(self, page_id: int) -> dict | None:
|
|
271
|
+
"""
|
|
272
|
+
Retrieves a unified document instance for a given page_id by searching all indices in Elasticsearch.
|
|
273
|
+
|
|
274
|
+
Args:
|
|
275
|
+
page_id (int): The page ID to search for.
|
|
276
|
+
|
|
277
|
+
Returns:
|
|
278
|
+
dict | None: A single dict representing the united document (with metadata and concatenated content),
|
|
279
|
+
or None if no documents are found.
|
|
280
|
+
"""
|
|
281
|
+
es_client = self.elastic_model.es_client
|
|
282
|
+
indices = es_client.indices.get_alias(index="*").keys()
|
|
283
|
+
parts_of_documents = find_page_id_in_all_indices(page_id=page_id, es_client=es_client, indices=indices)
|
|
284
|
+
if not parts_of_documents:
|
|
285
|
+
return None
|
|
286
|
+
full_document = unite_docs_to_single_instance(parts_of_documents)
|
|
287
|
+
return full_document
|
|
288
|
+
|
|
289
|
+
def transform_document_to_full_page(self, document: dict) -> dict:
|
|
290
|
+
"""
|
|
291
|
+
Adds the full page content to the document by retrieving it from Elasticsearch.
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
document (dict): The document to which the full page content will be added.
|
|
295
|
+
|
|
296
|
+
Returns:
|
|
297
|
+
dict: The updated document with the full page content added.
|
|
298
|
+
"""
|
|
299
|
+
if not document.get("page_id"):
|
|
300
|
+
return document
|
|
301
|
+
full_document = self.get_full_document_by_page_id(document["page_id"])
|
|
302
|
+
if full_document and full_document.get("content"):
|
|
303
|
+
document["content"] = full_document["content"]
|
|
304
|
+
return document
|
|
305
|
+
|
|
306
|
+
def get_full_document_by_page_id(self, page_id: int) -> dict | None:
|
|
307
|
+
"""
|
|
308
|
+
Retrieves a unified document instance for a given page_id by searching all indices in Elasticsearch.
|
|
309
|
+
|
|
310
|
+
Args:
|
|
311
|
+
page_id (int): The page ID to search for.
|
|
312
|
+
|
|
313
|
+
Returns:
|
|
314
|
+
dict | None: A single dict representing the united document (with metadata and concatenated content),
|
|
315
|
+
or None if no documents are found.
|
|
316
|
+
"""
|
|
317
|
+
es_client = self.elastic_model.es_client
|
|
318
|
+
indices = es_client.indices.get_alias(index="*").keys()
|
|
319
|
+
parts_of_documents = find_page_id_in_all_indices(page_id=page_id, es_client=es_client, indices=indices)
|
|
320
|
+
if not parts_of_documents:
|
|
321
|
+
return None
|
|
322
|
+
full_document = unite_docs_to_single_instance(parts_of_documents)
|
|
323
|
+
return full_document
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
engine = None
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def engine_factory(llms_client: LLMClient, es_client=None):
    """
    Returns the module-wide Engine, creating it on the first call.

    The arguments are only used when the Engine is created; once the module-level
    `engine` is set, later calls return it as-is and ignore their arguments.

    Args:
        llms_client (LLMClient): The LLM client handed to the Engine on creation.
        es_client (optional): The Elasticsearch client handed to the Engine on creation.

    Returns:
        Engine: The shared Engine instance.
    """
    global engine
    if engine is not None:
        return engine
    engine = Engine(llms_client=llms_client, es_client=es_client)
    return engine
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from elasticsearch import Elasticsearch
|
|
2
|
+
from typing import List, Dict, Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def find_page_id_in_all_indices(page_id: int, es_client: "Elasticsearch", indices: List[str],
                                size: int = 100) -> List[Dict[str, Any]]:
    """
    Search all provided indices using the given es_client for documents with the given page_id.

    Everything up to and including the first underscore is dropped from each index name
    before searching. Names that become identical after that are searched only once, so the
    same documents are not returned several times.

    Args:
        page_id: The page ID matched with a term query on the "page_id" field.
        es_client: Client object exposing `search(index=..., body=..., size=...)`.
        indices: Index names as listed by the cluster.
        size: Maximum number of hits requested per searched index. Default is 100.

    Returns:
        A list of all matching documents' _source fields, in search order.
    """
    all_docs = []
    stripped_names = ("_".join(index.split("_")[1:]) for index in indices)
    # dict.fromkeys keeps first-seen order while removing repeated names.
    for index in dict.fromkeys(stripped_names):
        resp = es_client.search(index=index, body={
            "query": {
                "term": {"page_id": page_id}
            }
        }, size=size)
        hits = resp.get('hits', {}).get('hits', [])
        all_docs.extend(doc['_source'] for doc in hits)
    return all_docs
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def unite_docs_to_single_instance(docs: List[Dict[str, Any]]) -> Dict[str, Any] | None:
|
|
24
|
+
"""
|
|
25
|
+
Unites a list of Elasticsearch document dicts (with the same page_id) into a single instance dict.
|
|
26
|
+
- Takes metadata fields (page_id, title, url, articleType, articleContentArea, etc.) from the first document.
|
|
27
|
+
- Concatenates all 'content' fields with a line break between them.
|
|
28
|
+
- Returns a single dict representing the united document, or None if docs is empty.
|
|
29
|
+
"""
|
|
30
|
+
if not docs:
|
|
31
|
+
return None
|
|
32
|
+
first = docs[0]
|
|
33
|
+
united_content = '\n'.join([doc.get('content', '') for doc in docs if doc.get('content')])
|
|
34
|
+
instance = {
|
|
35
|
+
"page_id": first.get("page_id"),
|
|
36
|
+
"title": first.get("title"),
|
|
37
|
+
"url": first.get("url"),
|
|
38
|
+
"link": first.get("url").split("/")[-1] if first.get("url") else None,
|
|
39
|
+
"articleType": first.get("articleType"),
|
|
40
|
+
"articleContentArea": first.get("articleContentArea"),
|
|
41
|
+
"summary": first.get("summary"),
|
|
42
|
+
"categories": first.get("categories"),
|
|
43
|
+
"content": united_content
|
|
44
|
+
}
|
|
45
|
+
return instance
|
kolzchut_ragbot/llm_client.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
from .Document import factory
|
|
2
|
-
from abc import ABC, abstractmethod
|
|
3
|
-
definitions = factory()
|
|
4
|
-
|
|
5
|
-
class LLMClient(ABC):
    """
    Abstract base for the LLM clients used to answer questions.

    Both __init__ and answer are abstract, so a concrete client has to override each of them.
    """

    @abstractmethod
    def __init__(self):
        """Store the name of the document field handed to the LLM (definitions.field_for_llm)."""
        self.field_for_answer = definitions.field_for_llm

    @abstractmethod
    def answer(self, _question, _top_k_docs) -> tuple[str, float, int]:
        """Answer _question from _top_k_docs; the base implementation always raises."""
        raise NotImplementedError()
|
|
1
|
+
from .Document import factory
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
definitions = factory()
|
|
4
|
+
|
|
5
|
+
class LLMClient(ABC):
    """
    Abstract interface for the LLM clients the Engine uses to answer questions.

    Both __init__ and answer are abstract, so a concrete client has to override each of them.
    """

    @abstractmethod
    def __init__(self):
        """Store the name of the document field handed to the LLM (definitions.field_for_llm)."""
        self.field_for_answer = definitions.field_for_llm

    @abstractmethod
    def answer(self, _question, _top_k_docs) -> tuple[str, float, int, object]:
        """
        Answer a question from the retrieved documents.

        Args:
            _question: The question to answer.
            _top_k_docs: The documents to answer from.

        Returns:
            A 4-tuple. Engine.answer_query unpacks it as
            (answer, elapsed time, tokens, request params), so implementations must
            return all four values.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
|