nucliadb 6.3.4.post3821__py3-none-any.whl → 6.3.4.post3836__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/search/search/query_parser/fetcher.py +104 -133
- {nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/METADATA +6 -6
- {nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/RECORD +6 -6
- {nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/WHEEL +0 -0
- {nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,7 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
+
import asyncio
|
20
21
|
from typing import Optional, TypeVar, Union
|
21
22
|
|
22
23
|
from async_lru import alru_cache
|
@@ -53,12 +54,9 @@ def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
|
|
53
54
|
|
54
55
|
class FetcherCache:
|
55
56
|
predict_query_info: Union[Optional[QueryInfo], NotCached] = not_cached
|
56
|
-
predict_detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached
|
57
57
|
|
58
58
|
# semantic search
|
59
|
-
query_vector: Union[Optional[list[float]], NotCached] = not_cached
|
60
59
|
vectorset: Union[str, NotCached] = not_cached
|
61
|
-
matryoshka_dimension: Union[Optional[int], NotCached] = not_cached
|
62
60
|
|
63
61
|
labels: Union[knowledgebox_pb2.Labels, NotCached] = not_cached
|
64
62
|
|
@@ -97,103 +95,80 @@ class Fetcher:
|
|
97
95
|
self.query = query
|
98
96
|
self.user_vector = user_vector
|
99
97
|
self.user_vectorset = vectorset
|
98
|
+
self.user_vectorset_validated = False
|
100
99
|
self.rephrase = rephrase
|
101
100
|
self.rephrase_prompt = rephrase_prompt
|
102
101
|
self.generative_model = generative_model
|
103
102
|
|
104
103
|
self.cache = FetcherCache()
|
105
|
-
self.
|
106
|
-
|
107
|
-
# Validation
|
108
|
-
|
109
|
-
async def initial_validate(self):
|
110
|
-
"""Runs a validation on the input parameters. It can raise errors if
|
111
|
-
there's some wrong parameter.
|
112
|
-
|
113
|
-
This function should be always called if validated input for fetching is
|
114
|
-
desired
|
115
|
-
"""
|
116
|
-
if self._validated:
|
117
|
-
return
|
118
|
-
|
119
|
-
self._validated = True
|
120
|
-
|
121
|
-
async def _validate_vectorset(self):
|
122
|
-
if self.user_vectorset is not None:
|
123
|
-
await validate_vectorset(self.kbid, self.user_vectorset)
|
104
|
+
self.locks: dict[str, asyncio.Lock] = {}
|
124
105
|
|
125
106
|
# Semantic search
|
126
107
|
|
127
108
|
async def get_matryoshka_dimension(self) -> Optional[int]:
|
128
|
-
if is_cached(self.cache.matryoshka_dimension):
|
129
|
-
return self.cache.matryoshka_dimension
|
130
|
-
|
131
109
|
vectorset = await self.get_vectorset()
|
132
|
-
|
133
|
-
self.cache.matryoshka_dimension = matryoshka_dimension
|
134
|
-
return matryoshka_dimension
|
110
|
+
return await get_matryoshka_dimension_cached(self.kbid, vectorset)
|
135
111
|
|
136
112
|
async def _get_user_vectorset(self) -> Optional[str]:
|
137
113
|
"""Returns the user's requested vectorset and validates if it does exist
|
138
114
|
in the KB.
|
139
115
|
|
140
116
|
"""
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
117
|
+
async with self.locks.setdefault("user_vectorset", asyncio.Lock()):
|
118
|
+
if not self.user_vectorset_validated:
|
119
|
+
if self.user_vectorset is not None:
|
120
|
+
await validate_vectorset(self.kbid, self.user_vectorset)
|
121
|
+
self.user_vectorset_validated = True
|
122
|
+
return self.user_vectorset
|
145
123
|
|
146
124
|
async def get_vectorset(self) -> str:
|
147
125
|
"""Get the vectorset to be used in the search. If not specified, by the
|
148
126
|
user, Predict API or the KB itself will provide a default.
|
149
127
|
|
150
128
|
"""
|
129
|
+
async with self.locks.setdefault("vectorset", asyncio.Lock()):
|
130
|
+
if is_cached(self.cache.vectorset):
|
131
|
+
return self.cache.vectorset
|
151
132
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
self.cache.vectorset = self.user_vectorset
|
158
|
-
return self.user_vectorset
|
133
|
+
user_vectorset = await self._get_user_vectorset()
|
134
|
+
if user_vectorset:
|
135
|
+
# user explicitly asked for a vectorset
|
136
|
+
self.cache.vectorset = user_vectorset
|
137
|
+
return user_vectorset
|
159
138
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
vectorset = None
|
164
|
-
else:
|
165
|
-
if query_info.sentence is None:
|
166
|
-
logger.error(
|
167
|
-
"Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
|
168
|
-
)
|
139
|
+
# when it's not provided, we get the default from Predict API
|
140
|
+
query_info = await self._predict_query_endpoint()
|
141
|
+
if query_info is None:
|
169
142
|
vectorset = None
|
170
143
|
else:
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
144
|
+
if query_info.sentence is None:
|
145
|
+
logger.error(
|
146
|
+
"Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
|
147
|
+
)
|
148
|
+
vectorset = None
|
149
|
+
else:
|
150
|
+
# vectors field is enforced by the data model to have at least one key
|
151
|
+
for vectorset in query_info.sentence.vectors.keys():
|
152
|
+
vectorset = vectorset
|
153
|
+
break
|
154
|
+
|
155
|
+
if vectorset is None:
|
156
|
+
# in case predict doesn't answer which vectorset to use, fall back to
|
157
|
+
# the first vectorset of the KB
|
158
|
+
async with datamanagers.with_ro_transaction() as txn:
|
159
|
+
async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
|
160
|
+
break
|
161
|
+
assert vectorset is not None, "All KBs must have at least one vectorset in maindb"
|
162
|
+
|
163
|
+
self.cache.vectorset = vectorset
|
164
|
+
return vectorset
|
186
165
|
|
187
166
|
async def get_query_vector(self) -> Optional[list[float]]:
|
188
|
-
if is_cached(self.cache.query_vector):
|
189
|
-
return self.cache.query_vector
|
190
|
-
|
191
167
|
if self.user_vector is not None:
|
192
168
|
query_vector = self.user_vector
|
193
169
|
else:
|
194
170
|
query_info = await self._predict_query_endpoint()
|
195
171
|
if query_info is None or query_info.sentence is None:
|
196
|
-
self.cache.query_vector = None
|
197
172
|
return None
|
198
173
|
|
199
174
|
vectorset = await self.get_vectorset()
|
@@ -206,7 +181,6 @@ class Fetcher:
|
|
206
181
|
"predict_vectorsets": ",".join(query_info.sentence.vectors.keys()),
|
207
182
|
},
|
208
183
|
)
|
209
|
-
self.cache.query_vector = None
|
210
184
|
return None
|
211
185
|
|
212
186
|
query_vector = query_info.sentence.vectors[vectorset]
|
@@ -223,7 +197,6 @@ class Fetcher:
|
|
223
197
|
# accordingly
|
224
198
|
query_vector = query_vector[:matryoshka_dimension]
|
225
199
|
|
226
|
-
self.cache.query_vector = query_vector
|
227
200
|
return query_vector
|
228
201
|
|
229
202
|
async def get_rephrased_query(self) -> Optional[str]:
|
@@ -235,100 +208,98 @@ class Fetcher:
|
|
235
208
|
# Labels
|
236
209
|
|
237
210
|
async def get_classification_labels(self) -> knowledgebox_pb2.Labels:
|
238
|
-
|
239
|
-
|
211
|
+
async with self.locks.setdefault("classification_labels", asyncio.Lock()):
|
212
|
+
if is_cached(self.cache.labels):
|
213
|
+
return self.cache.labels
|
240
214
|
|
241
|
-
|
242
|
-
|
243
|
-
|
215
|
+
labels = await get_classification_labels(self.kbid)
|
216
|
+
self.cache.labels = labels
|
217
|
+
return labels
|
244
218
|
|
245
219
|
# Entities
|
246
220
|
|
247
221
|
async def get_entities_meta_cache(self) -> datamanagers.entities.EntitiesMetaCache:
|
248
|
-
|
249
|
-
|
222
|
+
async with self.locks.setdefault("entities_meta_cache", asyncio.Lock()):
|
223
|
+
if is_cached(self.cache.entities_meta_cache):
|
224
|
+
return self.cache.entities_meta_cache
|
250
225
|
|
251
|
-
|
252
|
-
|
253
|
-
|
226
|
+
entities_meta_cache = await get_entities_meta_cache(self.kbid)
|
227
|
+
self.cache.entities_meta_cache = entities_meta_cache
|
228
|
+
return entities_meta_cache
|
254
229
|
|
255
230
|
async def get_deleted_entity_groups(self) -> list[str]:
|
256
|
-
|
257
|
-
|
231
|
+
async with self.locks.setdefault("deleted_entity_groups", asyncio.Lock()):
|
232
|
+
if is_cached(self.cache.deleted_entity_groups):
|
233
|
+
return self.cache.deleted_entity_groups
|
258
234
|
|
259
|
-
|
260
|
-
|
261
|
-
|
235
|
+
deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
|
236
|
+
self.cache.deleted_entity_groups = deleted_entity_groups
|
237
|
+
return deleted_entity_groups
|
262
238
|
|
263
239
|
async def get_detected_entities(self) -> list[utils_pb2.RelationNode]:
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
240
|
+
async with self.locks.setdefault("detected_entities", asyncio.Lock()):
|
241
|
+
if is_cached(self.cache.detected_entities):
|
242
|
+
return self.cache.detected_entities
|
243
|
+
|
244
|
+
# Optimization to avoid calling predict twice
|
245
|
+
if is_cached(self.cache.predict_query_info):
|
246
|
+
# /query supersets detect entities, so we already have them
|
247
|
+
query_info = self.cache.predict_query_info
|
248
|
+
if query_info is not None and query_info.entities is not None:
|
249
|
+
detected_entities = convert_relations(query_info.entities.model_dump())
|
250
|
+
else:
|
251
|
+
detected_entities = []
|
273
252
|
else:
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
# endpoint instead (as it's faster)
|
278
|
-
detected_entities = await self._predict_detect_entities()
|
253
|
+
# No call to /query has been done, we'll use detect entities
|
254
|
+
# endpoint instead (as it's faster)
|
255
|
+
detected_entities = await self._predict_detect_entities()
|
279
256
|
|
280
|
-
|
281
|
-
|
257
|
+
self.cache.detected_entities = detected_entities
|
258
|
+
return detected_entities
|
282
259
|
|
283
260
|
# Synonyms
|
284
261
|
|
285
262
|
async def get_synonyms(self) -> Optional[knowledgebox_pb2.Synonyms]:
|
286
|
-
|
287
|
-
|
263
|
+
async with self.locks.setdefault("synonyms", asyncio.Lock()):
|
264
|
+
if is_cached(self.cache.synonyms):
|
265
|
+
return self.cache.synonyms
|
288
266
|
|
289
|
-
|
290
|
-
|
291
|
-
|
267
|
+
synonyms = await get_kb_synonyms(self.kbid)
|
268
|
+
self.cache.synonyms = synonyms
|
269
|
+
return synonyms
|
292
270
|
|
293
271
|
# Predict API
|
294
272
|
|
295
273
|
async def _predict_query_endpoint(self) -> Optional[QueryInfo]:
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
)
|
315
|
-
except (SendToPredictError, TimeoutError):
|
316
|
-
query_info = None
|
274
|
+
async with self.locks.setdefault("predict_query_endpoint", asyncio.Lock()):
|
275
|
+
if is_cached(self.cache.predict_query_info):
|
276
|
+
return self.cache.predict_query_info
|
277
|
+
|
278
|
+
# we can't call get_vectorset, as it would do a recursive loop between
|
279
|
+
# functions, so we'll manually parse it
|
280
|
+
vectorset = await self._get_user_vectorset()
|
281
|
+
try:
|
282
|
+
query_info = await query_information(
|
283
|
+
self.kbid,
|
284
|
+
self.query,
|
285
|
+
vectorset,
|
286
|
+
self.generative_model,
|
287
|
+
self.rephrase,
|
288
|
+
self.rephrase_prompt,
|
289
|
+
)
|
290
|
+
except (SendToPredictError, TimeoutError):
|
291
|
+
query_info = None
|
317
292
|
|
318
|
-
|
319
|
-
|
293
|
+
self.cache.predict_query_info = query_info
|
294
|
+
return query_info
|
320
295
|
|
321
296
|
async def _predict_detect_entities(self) -> list[utils_pb2.RelationNode]:
|
322
|
-
if is_cached(self.cache.predict_detected_entities):
|
323
|
-
return self.cache.predict_detected_entities
|
324
|
-
|
325
297
|
try:
|
326
298
|
detected_entities = await detect_entities(self.kbid, self.query)
|
327
299
|
except (SendToPredictError, TimeoutError) as ex:
|
328
300
|
logger.warning(f"Errors on Predict API detecting entities: {ex}", extra={"kbid": self.kbid})
|
329
301
|
detected_entities = []
|
330
302
|
|
331
|
-
self.cache.predict_detected_entities = detected_entities
|
332
303
|
return detected_entities
|
333
304
|
|
334
305
|
|
@@ -360,7 +331,7 @@ async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]
|
|
360
331
|
|
361
332
|
|
362
333
|
@alru_cache(maxsize=None)
|
363
|
-
async def get_matryoshka_dimension_cached(kbid: str, vectorset:
|
334
|
+
async def get_matryoshka_dimension_cached(kbid: str, vectorset: str) -> Optional[int]:
|
364
335
|
# This can be safely cached as the matryoshka dimension is not expected to change
|
365
336
|
return await get_matryoshka_dimension(kbid, vectorset)
|
366
337
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.3.4.
|
3
|
+
Version: 6.3.4.post3836
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License: AGPL
|
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
21
21
|
Requires-Python: <4,>=3.9
|
22
22
|
Description-Content-Type: text/markdown
|
23
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.3.4.
|
24
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.
|
25
|
-
Requires-Dist: nucliadb-protos>=6.3.4.
|
26
|
-
Requires-Dist: nucliadb-models>=6.3.4.
|
27
|
-
Requires-Dist: nidx-protos>=6.3.4.
|
23
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3836
|
24
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3836
|
25
|
+
Requires-Dist: nucliadb-protos>=6.3.4.post3836
|
26
|
+
Requires-Dist: nucliadb-models>=6.3.4.post3836
|
27
|
+
Requires-Dist: nidx-protos>=6.3.4.post3836
|
28
28
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
29
29
|
Requires-Dist: nuclia-models>=0.24.2
|
30
30
|
Requires-Dist: uvicorn
|
@@ -251,7 +251,7 @@ nucliadb/search/search/chat/prompt.py,sha256=Jnja-Ss7skgnnDY8BymVfdeYsFPnIQFL8tE
|
|
251
251
|
nucliadb/search/search/chat/query.py,sha256=0IoeW-JNaRBe2d9C3bXNfkYpzmsN_IIg3U4Vqb8eOEk,16485
|
252
252
|
nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
253
253
|
nucliadb/search/search/query_parser/exceptions.py,sha256=szAOXUZ27oNY-OSa9t2hQ5HHkQQC0EX1FZz_LluJHJE,1224
|
254
|
-
nucliadb/search/search/query_parser/fetcher.py,sha256=
|
254
|
+
nucliadb/search/search/query_parser/fetcher.py,sha256=vrPgPnebWlrbx9fcelw7eHayEDptxnm9BP1F9LAEMws,15307
|
255
255
|
nucliadb/search/search/query_parser/filter_expression.py,sha256=rws5vsKTofX2iMUK4yvjmLZFxtcbWbyhIcwen4j0rQg,6578
|
256
256
|
nucliadb/search/search/query_parser/models.py,sha256=7czH-jHskl9axEnZV5XnQY8cyN_fs14Cpivzd5aaSxc,2686
|
257
257
|
nucliadb/search/search/query_parser/old_filters.py,sha256=-zbfN-RsXoj_DRjh3Lfp-wShwFXgkISawzVptVzja-A,9071
|
@@ -354,8 +354,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
354
354
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
355
355
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
356
356
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
357
|
-
nucliadb-6.3.4.
|
358
|
-
nucliadb-6.3.4.
|
359
|
-
nucliadb-6.3.4.
|
360
|
-
nucliadb-6.3.4.
|
361
|
-
nucliadb-6.3.4.
|
357
|
+
nucliadb-6.3.4.post3836.dist-info/METADATA,sha256=NFXVJeToiGNhr2_UHrIDPOY6ddlhdruKl9WZdb72eKs,4291
|
358
|
+
nucliadb-6.3.4.post3836.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
359
|
+
nucliadb-6.3.4.post3836.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
360
|
+
nucliadb-6.3.4.post3836.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
361
|
+
nucliadb-6.3.4.post3836.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|