nucliadb 6.3.4.post3821__py3-none-any.whl → 6.3.4.post3836__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ import asyncio
20
21
  from typing import Optional, TypeVar, Union
21
22
 
22
23
  from async_lru import alru_cache
@@ -53,12 +54,9 @@ def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
53
54
 
54
55
  class FetcherCache:
55
56
  predict_query_info: Union[Optional[QueryInfo], NotCached] = not_cached
56
- predict_detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached
57
57
 
58
58
  # semantic search
59
- query_vector: Union[Optional[list[float]], NotCached] = not_cached
60
59
  vectorset: Union[str, NotCached] = not_cached
61
- matryoshka_dimension: Union[Optional[int], NotCached] = not_cached
62
60
 
63
61
  labels: Union[knowledgebox_pb2.Labels, NotCached] = not_cached
64
62
 
@@ -97,103 +95,80 @@ class Fetcher:
97
95
  self.query = query
98
96
  self.user_vector = user_vector
99
97
  self.user_vectorset = vectorset
98
+ self.user_vectorset_validated = False
100
99
  self.rephrase = rephrase
101
100
  self.rephrase_prompt = rephrase_prompt
102
101
  self.generative_model = generative_model
103
102
 
104
103
  self.cache = FetcherCache()
105
- self._validated = False
106
-
107
- # Validation
108
-
109
- async def initial_validate(self):
110
- """Runs a validation on the input parameters. It can raise errors if
111
- there's some wrong parameter.
112
-
113
- This function should be always called if validated input for fetching is
114
- desired
115
- """
116
- if self._validated:
117
- return
118
-
119
- self._validated = True
120
-
121
- async def _validate_vectorset(self):
122
- if self.user_vectorset is not None:
123
- await validate_vectorset(self.kbid, self.user_vectorset)
104
+ self.locks: dict[str, asyncio.Lock] = {}
124
105
 
125
106
  # Semantic search
126
107
 
127
108
  async def get_matryoshka_dimension(self) -> Optional[int]:
128
- if is_cached(self.cache.matryoshka_dimension):
129
- return self.cache.matryoshka_dimension
130
-
131
109
  vectorset = await self.get_vectorset()
132
- matryoshka_dimension = await get_matryoshka_dimension_cached(self.kbid, vectorset)
133
- self.cache.matryoshka_dimension = matryoshka_dimension
134
- return matryoshka_dimension
110
+ return await get_matryoshka_dimension_cached(self.kbid, vectorset)
135
111
 
136
112
  async def _get_user_vectorset(self) -> Optional[str]:
137
113
  """Returns the user's requested vectorset and validates if it does exist
138
114
  in the KB.
139
115
 
140
116
  """
141
- vectorset = self.user_vectorset
142
- if not self._validated:
143
- await self._validate_vectorset()
144
- return vectorset
117
+ async with self.locks.setdefault("user_vectorset", asyncio.Lock()):
118
+ if not self.user_vectorset_validated:
119
+ if self.user_vectorset is not None:
120
+ await validate_vectorset(self.kbid, self.user_vectorset)
121
+ self.user_vectorset_validated = True
122
+ return self.user_vectorset
145
123
 
146
124
  async def get_vectorset(self) -> str:
147
125
  """Get the vectorset to be used in the search. If not specified, by the
148
126
  user, Predict API or the own uses KB will provide a default.
149
127
 
150
128
  """
129
+ async with self.locks.setdefault("vectorset", asyncio.Lock()):
130
+ if is_cached(self.cache.vectorset):
131
+ return self.cache.vectorset
151
132
 
152
- if is_cached(self.cache.vectorset):
153
- return self.cache.vectorset
154
-
155
- if self.user_vectorset:
156
- # user explicitly asked for a vectorset
157
- self.cache.vectorset = self.user_vectorset
158
- return self.user_vectorset
133
+ user_vectorset = await self._get_user_vectorset()
134
+ if user_vectorset:
135
+ # user explicitly asked for a vectorset
136
+ self.cache.vectorset = user_vectorset
137
+ return user_vectorset
159
138
 
160
- # when it's not provided, we get the default from Predict API
161
- query_info = await self._predict_query_endpoint()
162
- if query_info is None:
163
- vectorset = None
164
- else:
165
- if query_info.sentence is None:
166
- logger.error(
167
- "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
168
- )
139
+ # when it's not provided, we get the default from Predict API
140
+ query_info = await self._predict_query_endpoint()
141
+ if query_info is None:
169
142
  vectorset = None
170
143
  else:
171
- # vectors field is enforced by the data model to have at least one key
172
- for vectorset in query_info.sentence.vectors.keys():
173
- vectorset = vectorset
174
- break
175
-
176
- if vectorset is None:
177
- # in case predict don't answer which vectorset to use, fallback to
178
- # the first vectorset of the KB
179
- async with datamanagers.with_ro_transaction() as txn:
180
- async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
181
- break
182
- assert vectorset is not None, "All KBs must have at least one vectorset in maindb"
183
-
184
- self.cache.vectorset = vectorset
185
- return vectorset
144
+ if query_info.sentence is None:
145
+ logger.error(
146
+ "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
147
+ )
148
+ vectorset = None
149
+ else:
150
+ # vectors field is enforced by the data model to have at least one key
151
+ for vectorset in query_info.sentence.vectors.keys():
152
+ vectorset = vectorset
153
+ break
154
+
155
+ if vectorset is None:
156
+ # in case predict don't answer which vectorset to use, fallback to
157
+ # the first vectorset of the KB
158
+ async with datamanagers.with_ro_transaction() as txn:
159
+ async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
160
+ break
161
+ assert vectorset is not None, "All KBs must have at least one vectorset in maindb"
162
+
163
+ self.cache.vectorset = vectorset
164
+ return vectorset
186
165
 
187
166
  async def get_query_vector(self) -> Optional[list[float]]:
188
- if is_cached(self.cache.query_vector):
189
- return self.cache.query_vector
190
-
191
167
  if self.user_vector is not None:
192
168
  query_vector = self.user_vector
193
169
  else:
194
170
  query_info = await self._predict_query_endpoint()
195
171
  if query_info is None or query_info.sentence is None:
196
- self.cache.query_vector = None
197
172
  return None
198
173
 
199
174
  vectorset = await self.get_vectorset()
@@ -206,7 +181,6 @@ class Fetcher:
206
181
  "predict_vectorsets": ",".join(query_info.sentence.vectors.keys()),
207
182
  },
208
183
  )
209
- self.cache.query_vector = None
210
184
  return None
211
185
 
212
186
  query_vector = query_info.sentence.vectors[vectorset]
@@ -223,7 +197,6 @@ class Fetcher:
223
197
  # accordingly
224
198
  query_vector = query_vector[:matryoshka_dimension]
225
199
 
226
- self.cache.query_vector = query_vector
227
200
  return query_vector
228
201
 
229
202
  async def get_rephrased_query(self) -> Optional[str]:
@@ -235,100 +208,98 @@ class Fetcher:
235
208
  # Labels
236
209
 
237
210
  async def get_classification_labels(self) -> knowledgebox_pb2.Labels:
238
- if is_cached(self.cache.labels):
239
- return self.cache.labels
211
+ async with self.locks.setdefault("classification_labels", asyncio.Lock()):
212
+ if is_cached(self.cache.labels):
213
+ return self.cache.labels
240
214
 
241
- labels = await get_classification_labels(self.kbid)
242
- self.cache.labels = labels
243
- return labels
215
+ labels = await get_classification_labels(self.kbid)
216
+ self.cache.labels = labels
217
+ return labels
244
218
 
245
219
  # Entities
246
220
 
247
221
  async def get_entities_meta_cache(self) -> datamanagers.entities.EntitiesMetaCache:
248
- if is_cached(self.cache.entities_meta_cache):
249
- return self.cache.entities_meta_cache
222
+ async with self.locks.setdefault("entities_meta_cache", asyncio.Lock()):
223
+ if is_cached(self.cache.entities_meta_cache):
224
+ return self.cache.entities_meta_cache
250
225
 
251
- entities_meta_cache = await get_entities_meta_cache(self.kbid)
252
- self.cache.entities_meta_cache = entities_meta_cache
253
- return entities_meta_cache
226
+ entities_meta_cache = await get_entities_meta_cache(self.kbid)
227
+ self.cache.entities_meta_cache = entities_meta_cache
228
+ return entities_meta_cache
254
229
 
255
230
  async def get_deleted_entity_groups(self) -> list[str]:
256
- if is_cached(self.cache.deleted_entity_groups):
257
- return self.cache.deleted_entity_groups
231
+ async with self.locks.setdefault("deleted_entity_groups", asyncio.Lock()):
232
+ if is_cached(self.cache.deleted_entity_groups):
233
+ return self.cache.deleted_entity_groups
258
234
 
259
- deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
260
- self.cache.deleted_entity_groups = deleted_entity_groups
261
- return deleted_entity_groups
235
+ deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
236
+ self.cache.deleted_entity_groups = deleted_entity_groups
237
+ return deleted_entity_groups
262
238
 
263
239
  async def get_detected_entities(self) -> list[utils_pb2.RelationNode]:
264
- if is_cached(self.cache.detected_entities):
265
- return self.cache.detected_entities
266
-
267
- # Optimization to avoid calling predict twice
268
- if is_cached(self.cache.predict_query_info):
269
- # /query supersets detect entities, so we already have them
270
- query_info = self.cache.predict_query_info
271
- if query_info is not None and query_info.entities is not None:
272
- detected_entities = convert_relations(query_info.entities.model_dump())
240
+ async with self.locks.setdefault("detected_entities", asyncio.Lock()):
241
+ if is_cached(self.cache.detected_entities):
242
+ return self.cache.detected_entities
243
+
244
+ # Optimization to avoid calling predict twice
245
+ if is_cached(self.cache.predict_query_info):
246
+ # /query supersets detect entities, so we already have them
247
+ query_info = self.cache.predict_query_info
248
+ if query_info is not None and query_info.entities is not None:
249
+ detected_entities = convert_relations(query_info.entities.model_dump())
250
+ else:
251
+ detected_entities = []
273
252
  else:
274
- detected_entities = []
275
- else:
276
- # No call to /query has been done, we'll use detect entities
277
- # endpoint instead (as it's faster)
278
- detected_entities = await self._predict_detect_entities()
253
+ # No call to /query has been done, we'll use detect entities
254
+ # endpoint instead (as it's faster)
255
+ detected_entities = await self._predict_detect_entities()
279
256
 
280
- self.cache.detected_entities = detected_entities
281
- return detected_entities
257
+ self.cache.detected_entities = detected_entities
258
+ return detected_entities
282
259
 
283
260
  # Synonyms
284
261
 
285
262
  async def get_synonyms(self) -> Optional[knowledgebox_pb2.Synonyms]:
286
- if is_cached(self.cache.synonyms):
287
- return self.cache.synonyms
263
+ async with self.locks.setdefault("synonyms", asyncio.Lock()):
264
+ if is_cached(self.cache.synonyms):
265
+ return self.cache.synonyms
288
266
 
289
- synonyms = await get_kb_synonyms(self.kbid)
290
- self.cache.synonyms = synonyms
291
- return synonyms
267
+ synonyms = await get_kb_synonyms(self.kbid)
268
+ self.cache.synonyms = synonyms
269
+ return synonyms
292
270
 
293
271
  # Predict API
294
272
 
295
273
  async def _predict_query_endpoint(self) -> Optional[QueryInfo]:
296
- if is_cached(self.cache.predict_query_info):
297
- return self.cache.predict_query_info
298
-
299
- # calling twice should be avoided as query endpoint is a superset of detect entities
300
- if is_cached(self.cache.predict_detected_entities):
301
- logger.warning("Fetcher is not being efficient enough and has called predict twice!")
302
-
303
- # we can't call get_vectorset, as it would do a recirsive loop between
304
- # functions, so we'll manually parse it
305
- vectorset = await self._get_user_vectorset()
306
- try:
307
- query_info = await query_information(
308
- self.kbid,
309
- self.query,
310
- vectorset,
311
- self.generative_model,
312
- self.rephrase,
313
- self.rephrase_prompt,
314
- )
315
- except (SendToPredictError, TimeoutError):
316
- query_info = None
274
+ async with self.locks.setdefault("predict_query_endpoint", asyncio.Lock()):
275
+ if is_cached(self.cache.predict_query_info):
276
+ return self.cache.predict_query_info
277
+
278
+ # we can't call get_vectorset, as it would do a recirsive loop between
279
+ # functions, so we'll manually parse it
280
+ vectorset = await self._get_user_vectorset()
281
+ try:
282
+ query_info = await query_information(
283
+ self.kbid,
284
+ self.query,
285
+ vectorset,
286
+ self.generative_model,
287
+ self.rephrase,
288
+ self.rephrase_prompt,
289
+ )
290
+ except (SendToPredictError, TimeoutError):
291
+ query_info = None
317
292
 
318
- self.cache.predict_query_info = query_info
319
- return query_info
293
+ self.cache.predict_query_info = query_info
294
+ return query_info
320
295
 
321
296
  async def _predict_detect_entities(self) -> list[utils_pb2.RelationNode]:
322
- if is_cached(self.cache.predict_detected_entities):
323
- return self.cache.predict_detected_entities
324
-
325
297
  try:
326
298
  detected_entities = await detect_entities(self.kbid, self.query)
327
299
  except (SendToPredictError, TimeoutError) as ex:
328
300
  logger.warning(f"Errors on Predict API detecting entities: {ex}", extra={"kbid": self.kbid})
329
301
  detected_entities = []
330
302
 
331
- self.cache.predict_detected_entities = detected_entities
332
303
  return detected_entities
333
304
 
334
305
 
@@ -360,7 +331,7 @@ async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]
360
331
 
361
332
 
362
333
  @alru_cache(maxsize=None)
363
- async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
334
+ async def get_matryoshka_dimension_cached(kbid: str, vectorset: str) -> Optional[int]:
364
335
  # This can be safely cached as the matryoshka dimension is not expected to change
365
336
  return await get_matryoshka_dimension(kbid, vectorset)
366
337
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.3.4.post3821
3
+ Version: 6.3.4.post3836
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3821
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3821
25
- Requires-Dist: nucliadb-protos>=6.3.4.post3821
26
- Requires-Dist: nucliadb-models>=6.3.4.post3821
27
- Requires-Dist: nidx-protos>=6.3.4.post3821
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3836
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3836
25
+ Requires-Dist: nucliadb-protos>=6.3.4.post3836
26
+ Requires-Dist: nucliadb-models>=6.3.4.post3836
27
+ Requires-Dist: nidx-protos>=6.3.4.post3836
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn
@@ -251,7 +251,7 @@ nucliadb/search/search/chat/prompt.py,sha256=Jnja-Ss7skgnnDY8BymVfdeYsFPnIQFL8tE
251
251
  nucliadb/search/search/chat/query.py,sha256=0IoeW-JNaRBe2d9C3bXNfkYpzmsN_IIg3U4Vqb8eOEk,16485
252
252
  nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
253
253
  nucliadb/search/search/query_parser/exceptions.py,sha256=szAOXUZ27oNY-OSa9t2hQ5HHkQQC0EX1FZz_LluJHJE,1224
254
- nucliadb/search/search/query_parser/fetcher.py,sha256=jhr__J0KmAzjdsTTadWQmD9qf6lZvqlKAfZdYjZH_UY,15742
254
+ nucliadb/search/search/query_parser/fetcher.py,sha256=vrPgPnebWlrbx9fcelw7eHayEDptxnm9BP1F9LAEMws,15307
255
255
  nucliadb/search/search/query_parser/filter_expression.py,sha256=rws5vsKTofX2iMUK4yvjmLZFxtcbWbyhIcwen4j0rQg,6578
256
256
  nucliadb/search/search/query_parser/models.py,sha256=7czH-jHskl9axEnZV5XnQY8cyN_fs14Cpivzd5aaSxc,2686
257
257
  nucliadb/search/search/query_parser/old_filters.py,sha256=-zbfN-RsXoj_DRjh3Lfp-wShwFXgkISawzVptVzja-A,9071
@@ -354,8 +354,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
354
354
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
355
355
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
356
356
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
357
- nucliadb-6.3.4.post3821.dist-info/METADATA,sha256=I_IqkABVw9DZGcLJyK5KVtV7Tr4d8x1Ut3GX-Oncydo,4291
358
- nucliadb-6.3.4.post3821.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
359
- nucliadb-6.3.4.post3821.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
360
- nucliadb-6.3.4.post3821.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
361
- nucliadb-6.3.4.post3821.dist-info/RECORD,,
357
+ nucliadb-6.3.4.post3836.dist-info/METADATA,sha256=NFXVJeToiGNhr2_UHrIDPOY6ddlhdruKl9WZdb72eKs,4291
358
+ nucliadb-6.3.4.post3836.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
359
+ nucliadb-6.3.4.post3836.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
360
+ nucliadb-6.3.4.post3836.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
361
+ nucliadb-6.3.4.post3836.dist-info/RECORD,,