PyPI - nucliadb - Versions diffs - 6.3.4.post3821__py3-none-any.whl → 6.3.4.post3836__py3-none-any.whl - Mend

nucliadb 6.3.4.post3821py3-none-any.whl → 6.3.4.post3836py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

nucliadb/search/search/query_parser/fetcher.py CHANGED Viewed

@@ -17,6 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+import asyncio
 from typing import Optional, TypeVar, Union
 from async_lru import alru_cache
@@ -53,12 +54,9 @@ def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
 class FetcherCache:
     predict_query_info: Union[Optional[QueryInfo], NotCached] = not_cached
-    predict_detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached
     # semantic search
-    query_vector: Union[Optional[list[float]], NotCached] = not_cached
     vectorset: Union[str, NotCached] = not_cached
-    matryoshka_dimension: Union[Optional[int], NotCached] = not_cached
     labels: Union[knowledgebox_pb2.Labels, NotCached] = not_cached
@@ -97,103 +95,80 @@ class Fetcher:
         self.query = query
         self.user_vector = user_vector
         self.user_vectorset = vectorset
+        self.user_vectorset_validated = False
         self.rephrase = rephrase
         self.rephrase_prompt = rephrase_prompt
         self.generative_model = generative_model
         self.cache = FetcherCache()
-        self._validated = False
-    # Validation
-    async def initial_validate(self):
-        """Runs a validation on the input parameters. It can raise errors if
-        there's some wrong parameter.
-        This function should be always called if validated input for fetching is
-        desired
-        """
-        if self._validated:
-            return
-        self._validated = True
-    async def _validate_vectorset(self):
-        if self.user_vectorset is not None:
-            await validate_vectorset(self.kbid, self.user_vectorset)
+        self.locks: dict[str, asyncio.Lock] = {}
     # Semantic search
     async def get_matryoshka_dimension(self) -> Optional[int]:
-        if is_cached(self.cache.matryoshka_dimension):
-            return self.cache.matryoshka_dimension
         vectorset = await self.get_vectorset()
-        matryoshka_dimension = await get_matryoshka_dimension_cached(self.kbid, vectorset)
-        self.cache.matryoshka_dimension = matryoshka_dimension
-        return matryoshka_dimension
+        return await get_matryoshka_dimension_cached(self.kbid, vectorset)
     async def _get_user_vectorset(self) -> Optional[str]:
         """Returns the user's requested vectorset and validates if it does exist
         in the KB.
         """
-        vectorset = self.user_vectorset
-        if not self._validated:
-            await self._validate_vectorset()
-        return vectorset
+        async with self.locks.setdefault("user_vectorset", asyncio.Lock()):
+            if not self.user_vectorset_validated:
+                if self.user_vectorset is not None:
+                    await validate_vectorset(self.kbid, self.user_vectorset)
+            self.user_vectorset_validated = True
+            return self.user_vectorset
     async def get_vectorset(self) -> str:
         """Get the vectorset to be used in the search. If not specified, by the
         user, Predict API or the own uses KB will provide a default.
         """
+        async with self.locks.setdefault("vectorset", asyncio.Lock()):
+            if is_cached(self.cache.vectorset):
+                return self.cache.vectorset
-        if is_cached(self.cache.vectorset):
-            return self.cache.vectorset
-        if self.user_vectorset:
-            # user explicitly asked for a vectorset
-            self.cache.vectorset = self.user_vectorset
-            return self.user_vectorset
+            user_vectorset = await self._get_user_vectorset()
+            if user_vectorset:
+                # user explicitly asked for a vectorset
+                self.cache.vectorset = user_vectorset
+                return user_vectorset
-        # when it's not provided, we get the default from Predict API
-        query_info = await self._predict_query_endpoint()
-        if query_info is None:
-            vectorset = None
-        else:
-            if query_info.sentence is None:
-                logger.error(
-                    "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
-                )
+            # when it's not provided, we get the default from Predict API
+            query_info = await self._predict_query_endpoint()
+            if query_info is None:
                 vectorset = None
             else:
-                # vectors field is enforced by the data model to have at least one key
-                for vectorset in query_info.sentence.vectors.keys():
-                    vectorset = vectorset
-                    break
-        if vectorset is None:
-            # in case predict don't answer which vectorset to use, fallback to
-            # the first vectorset of the KB
-            async with datamanagers.with_ro_transaction() as txn:
-                async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
-                    break
-            assert vectorset is not None, "All KBs must have at least one vectorset in maindb"
-        self.cache.vectorset = vectorset
-        return vectorset
+                if query_info.sentence is None:
+                    logger.error(
+                        "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
+                    )
+                    vectorset = None
+                else:
+                    # vectors field is enforced by the data model to have at least one key
+                    for vectorset in query_info.sentence.vectors.keys():
+                        vectorset = vectorset
+                        break
+            if vectorset is None:
+                # in case predict don't answer which vectorset to use, fallback to
+                # the first vectorset of the KB
+                async with datamanagers.with_ro_transaction() as txn:
+                    async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
+                        break
+                assert vectorset is not None, "All KBs must have at least one vectorset in maindb"
+            self.cache.vectorset = vectorset
+            return vectorset
     async def get_query_vector(self) -> Optional[list[float]]:
-        if is_cached(self.cache.query_vector):
-            return self.cache.query_vector
         if self.user_vector is not None:
             query_vector = self.user_vector
         else:
             query_info = await self._predict_query_endpoint()
             if query_info is None or query_info.sentence is None:
-                self.cache.query_vector = None
                 return None
             vectorset = await self.get_vectorset()
@@ -206,7 +181,6 @@ class Fetcher:
                         "predict_vectorsets": ",".join(query_info.sentence.vectors.keys()),
                     },
                 )
-                self.cache.query_vector = None
                 return None
             query_vector = query_info.sentence.vectors[vectorset]
@@ -223,7 +197,6 @@ class Fetcher:
             # accordingly
             query_vector = query_vector[:matryoshka_dimension]
-        self.cache.query_vector = query_vector
         return query_vector
     async def get_rephrased_query(self) -> Optional[str]:
@@ -235,100 +208,98 @@ class Fetcher:
     # Labels
     async def get_classification_labels(self) -> knowledgebox_pb2.Labels:
-        if is_cached(self.cache.labels):
-            return self.cache.labels
+        async with self.locks.setdefault("classification_labels", asyncio.Lock()):
+            if is_cached(self.cache.labels):
+                return self.cache.labels
-        labels = await get_classification_labels(self.kbid)
-        self.cache.labels = labels
-        return labels
+            labels = await get_classification_labels(self.kbid)
+            self.cache.labels = labels
+            return labels
     # Entities
     async def get_entities_meta_cache(self) -> datamanagers.entities.EntitiesMetaCache:
-        if is_cached(self.cache.entities_meta_cache):
-            return self.cache.entities_meta_cache
+        async with self.locks.setdefault("entities_meta_cache", asyncio.Lock()):
+            if is_cached(self.cache.entities_meta_cache):
+                return self.cache.entities_meta_cache
-        entities_meta_cache = await get_entities_meta_cache(self.kbid)
-        self.cache.entities_meta_cache = entities_meta_cache
-        return entities_meta_cache
+            entities_meta_cache = await get_entities_meta_cache(self.kbid)
+            self.cache.entities_meta_cache = entities_meta_cache
+            return entities_meta_cache
     async def get_deleted_entity_groups(self) -> list[str]:
-        if is_cached(self.cache.deleted_entity_groups):
-            return self.cache.deleted_entity_groups
+        async with self.locks.setdefault("deleted_entity_groups", asyncio.Lock()):
+            if is_cached(self.cache.deleted_entity_groups):
+                return self.cache.deleted_entity_groups
-        deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
-        self.cache.deleted_entity_groups = deleted_entity_groups
-        return deleted_entity_groups
+            deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
+            self.cache.deleted_entity_groups = deleted_entity_groups
+            return deleted_entity_groups
     async def get_detected_entities(self) -> list[utils_pb2.RelationNode]:
-        if is_cached(self.cache.detected_entities):
-            return self.cache.detected_entities
-        # Optimization to avoid calling predict twice
-        if is_cached(self.cache.predict_query_info):
-            # /query supersets detect entities, so we already have them
-            query_info = self.cache.predict_query_info
-            if query_info is not None and query_info.entities is not None:
-                detected_entities = convert_relations(query_info.entities.model_dump())
+        async with self.locks.setdefault("detected_entities", asyncio.Lock()):
+            if is_cached(self.cache.detected_entities):
+                return self.cache.detected_entities
+            # Optimization to avoid calling predict twice
+            if is_cached(self.cache.predict_query_info):
+                # /query supersets detect entities, so we already have them
+                query_info = self.cache.predict_query_info
+                if query_info is not None and query_info.entities is not None:
+                    detected_entities = convert_relations(query_info.entities.model_dump())
+                else:
+                    detected_entities = []
             else:
-                detected_entities = []
-        else:
-            # No call to /query has been done, we'll use detect entities
-            # endpoint instead (as it's faster)
-            detected_entities = await self._predict_detect_entities()
+                # No call to /query has been done, we'll use detect entities
+                # endpoint instead (as it's faster)
+                detected_entities = await self._predict_detect_entities()
-        self.cache.detected_entities = detected_entities
-        return detected_entities
+            self.cache.detected_entities = detected_entities
+            return detected_entities
     # Synonyms
     async def get_synonyms(self) -> Optional[knowledgebox_pb2.Synonyms]:
-        if is_cached(self.cache.synonyms):
-            return self.cache.synonyms
+        async with self.locks.setdefault("synonyms", asyncio.Lock()):
+            if is_cached(self.cache.synonyms):
+                return self.cache.synonyms
-        synonyms = await get_kb_synonyms(self.kbid)
-        self.cache.synonyms = synonyms
-        return synonyms
+            synonyms = await get_kb_synonyms(self.kbid)
+            self.cache.synonyms = synonyms
+            return synonyms
     # Predict API
     async def _predict_query_endpoint(self) -> Optional[QueryInfo]:
-        if is_cached(self.cache.predict_query_info):
-            return self.cache.predict_query_info
-        # calling twice should be avoided as query endpoint is a superset of detect entities
-        if is_cached(self.cache.predict_detected_entities):
-            logger.warning("Fetcher is not being efficient enough and has called predict twice!")
-        # we can't call get_vectorset, as it would do a recirsive loop between
-        # functions, so we'll manually parse it
-        vectorset = await self._get_user_vectorset()
-        try:
-            query_info = await query_information(
-                self.kbid,
-                self.query,
-                vectorset,
-                self.generative_model,
-                self.rephrase,
-                self.rephrase_prompt,
-            )
-        except (SendToPredictError, TimeoutError):
-            query_info = None
+        async with self.locks.setdefault("predict_query_endpoint", asyncio.Lock()):
+            if is_cached(self.cache.predict_query_info):
+                return self.cache.predict_query_info
+            # we can't call get_vectorset, as it would do a recirsive loop between
+            # functions, so we'll manually parse it
+            vectorset = await self._get_user_vectorset()
+            try:
+                query_info = await query_information(
+                    self.kbid,
+                    self.query,
+                    vectorset,
+                    self.generative_model,
+                    self.rephrase,
+                    self.rephrase_prompt,
+                )
+            except (SendToPredictError, TimeoutError):
+                query_info = None
-        self.cache.predict_query_info = query_info
-        return query_info
+            self.cache.predict_query_info = query_info
+            return query_info
     async def _predict_detect_entities(self) -> list[utils_pb2.RelationNode]:
-        if is_cached(self.cache.predict_detected_entities):
-            return self.cache.predict_detected_entities
         try:
             detected_entities = await detect_entities(self.kbid, self.query)
         except (SendToPredictError, TimeoutError) as ex:
             logger.warning(f"Errors on Predict API detecting entities: {ex}", extra={"kbid": self.kbid})
             detected_entities = []
-        self.cache.predict_detected_entities = detected_entities
         return detected_entities
@@ -360,7 +331,7 @@ async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]
 @alru_cache(maxsize=None)
-async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
+async def get_matryoshka_dimension_cached(kbid: str, vectorset: str) -> Optional[int]:
     # This can be safely cached as the matryoshka dimension is not expected to change
     return await get_matryoshka_dimension(kbid, vectorset)

{nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.3.4.post3821
+Version: 6.3.4.post3836
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3821
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3821
-Requires-Dist: nucliadb-protos>=6.3.4.post3821
-Requires-Dist: nucliadb-models>=6.3.4.post3821
-Requires-Dist: nidx-protos>=6.3.4.post3821
+Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3836
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3836
+Requires-Dist: nucliadb-protos>=6.3.4.post3836
+Requires-Dist: nucliadb-models>=6.3.4.post3836
+Requires-Dist: nidx-protos>=6.3.4.post3836
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn

{nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/RECORD RENAMED Viewed

@@ -251,7 +251,7 @@ nucliadb/search/search/chat/prompt.py,sha256=Jnja-Ss7skgnnDY8BymVfdeYsFPnIQFL8tE
 nucliadb/search/search/chat/query.py,sha256=0IoeW-JNaRBe2d9C3bXNfkYpzmsN_IIg3U4Vqb8eOEk,16485
 nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/search/query_parser/exceptions.py,sha256=szAOXUZ27oNY-OSa9t2hQ5HHkQQC0EX1FZz_LluJHJE,1224
-nucliadb/search/search/query_parser/fetcher.py,sha256=jhr__J0KmAzjdsTTadWQmD9qf6lZvqlKAfZdYjZH_UY,15742
+nucliadb/search/search/query_parser/fetcher.py,sha256=vrPgPnebWlrbx9fcelw7eHayEDptxnm9BP1F9LAEMws,15307
 nucliadb/search/search/query_parser/filter_expression.py,sha256=rws5vsKTofX2iMUK4yvjmLZFxtcbWbyhIcwen4j0rQg,6578
 nucliadb/search/search/query_parser/models.py,sha256=7czH-jHskl9axEnZV5XnQY8cyN_fs14Cpivzd5aaSxc,2686
 nucliadb/search/search/query_parser/old_filters.py,sha256=-zbfN-RsXoj_DRjh3Lfp-wShwFXgkISawzVptVzja-A,9071
@@ -354,8 +354,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.3.4.post3821.dist-info/METADATA,sha256=I_IqkABVw9DZGcLJyK5KVtV7Tr4d8x1Ut3GX-Oncydo,4291
-nucliadb-6.3.4.post3821.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-nucliadb-6.3.4.post3821.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
-nucliadb-6.3.4.post3821.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
-nucliadb-6.3.4.post3821.dist-info/RECORD,,
+nucliadb-6.3.4.post3836.dist-info/METADATA,sha256=NFXVJeToiGNhr2_UHrIDPOY6ddlhdruKl9WZdb72eKs,4291
+nucliadb-6.3.4.post3836.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+nucliadb-6.3.4.post3836.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.3.4.post3836.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.3.4.post3836.dist-info/RECORD,,

{nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/WHEEL RENAMED Viewed

File without changes

{nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nucliadb-6.3.4.post3821.dist-info → nucliadb-6.3.4.post3836.dist-info}/top_level.txt RENAMED Viewed

File without changes

nucliadb 6.3.4.post3821__py3-none-any.whl → 6.3.4.post3836__py3-none-any.whl

nucliadb 6.3.4.post3821py3-none-any.whl → 6.3.4.post3836py3-none-any.whl