PyPI - nucliadb - Versions diffs - 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl - Mend

nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (246) hide show

migrations/0023_backfill_pg_catalog.py +8 -4
migrations/0028_extracted_vectors_reference.py +1 -1
migrations/0029_backfill_field_status.py +3 -4
migrations/0032_remove_old_relations.py +2 -3
migrations/0038_backfill_catalog_field_labels.py +8 -4
migrations/0039_backfill_converation_splits_metadata.py +106 -0
migrations/0040_migrate_search_configurations.py +79 -0
migrations/0041_reindex_conversations.py +137 -0
migrations/pg/0010_shards_index.py +34 -0
nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
migrations/pg/0012_catalog_statistics_undo.py +26 -0
nucliadb/backups/create.py +2 -15
nucliadb/backups/restore.py +4 -15
nucliadb/backups/tasks.py +4 -1
nucliadb/common/back_pressure/cache.py +2 -3
nucliadb/common/back_pressure/materializer.py +7 -13
nucliadb/common/back_pressure/settings.py +6 -6
nucliadb/common/back_pressure/utils.py +1 -0
nucliadb/common/cache.py +9 -9
nucliadb/common/catalog/__init__.py +79 -0
nucliadb/common/catalog/dummy.py +36 -0
nucliadb/common/catalog/interface.py +85 -0
nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
nucliadb/common/catalog/utils.py +56 -0
nucliadb/common/cluster/manager.py +8 -23
nucliadb/common/cluster/rebalance.py +484 -112
nucliadb/common/cluster/rollover.py +36 -9
nucliadb/common/cluster/settings.py +4 -9
nucliadb/common/cluster/utils.py +34 -8
nucliadb/common/context/__init__.py +7 -8
nucliadb/common/context/fastapi.py +1 -2
nucliadb/common/datamanagers/__init__.py +2 -4
nucliadb/common/datamanagers/atomic.py +9 -2
nucliadb/common/datamanagers/cluster.py +1 -2
nucliadb/common/datamanagers/fields.py +3 -4
nucliadb/common/datamanagers/kb.py +6 -6
nucliadb/common/datamanagers/labels.py +2 -3
nucliadb/common/datamanagers/resources.py +10 -33
nucliadb/common/datamanagers/rollover.py +5 -7
nucliadb/common/datamanagers/search_configurations.py +1 -2
nucliadb/common/datamanagers/synonyms.py +1 -2
nucliadb/common/datamanagers/utils.py +4 -4
nucliadb/common/datamanagers/vectorsets.py +4 -4
nucliadb/common/external_index_providers/base.py +32 -5
nucliadb/common/external_index_providers/manager.py +5 -34
nucliadb/common/external_index_providers/settings.py +1 -27
nucliadb/common/filter_expression.py +129 -41
nucliadb/common/http_clients/exceptions.py +8 -0
nucliadb/common/http_clients/processing.py +16 -23
nucliadb/common/http_clients/utils.py +3 -0
nucliadb/common/ids.py +82 -58
nucliadb/common/locking.py +1 -2
nucliadb/common/maindb/driver.py +9 -8
nucliadb/common/maindb/local.py +5 -5
nucliadb/common/maindb/pg.py +9 -8
nucliadb/common/nidx.py +22 -5
nucliadb/common/vector_index_config.py +1 -1
nucliadb/export_import/datamanager.py +4 -3
nucliadb/export_import/exporter.py +11 -19
nucliadb/export_import/importer.py +13 -6
nucliadb/export_import/tasks.py +2 -0
nucliadb/export_import/utils.py +6 -18
nucliadb/health.py +2 -2
nucliadb/ingest/app.py +8 -8
nucliadb/ingest/consumer/consumer.py +8 -10
nucliadb/ingest/consumer/pull.py +10 -8
nucliadb/ingest/consumer/service.py +5 -30
nucliadb/ingest/consumer/shard_creator.py +16 -5
nucliadb/ingest/consumer/utils.py +1 -1
nucliadb/ingest/fields/base.py +37 -49
nucliadb/ingest/fields/conversation.py +55 -9
nucliadb/ingest/fields/exceptions.py +1 -2
nucliadb/ingest/fields/file.py +22 -8
nucliadb/ingest/fields/link.py +7 -7
nucliadb/ingest/fields/text.py +2 -3
nucliadb/ingest/orm/brain_v2.py +89 -57
nucliadb/ingest/orm/broker_message.py +2 -4
nucliadb/ingest/orm/entities.py +10 -209
nucliadb/ingest/orm/index_message.py +128 -113
nucliadb/ingest/orm/knowledgebox.py +91 -59
nucliadb/ingest/orm/processor/auditing.py +1 -3
nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
nucliadb/ingest/orm/processor/processor.py +98 -153
nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
nucliadb/ingest/orm/resource.py +82 -71
nucliadb/ingest/orm/utils.py +1 -1
nucliadb/ingest/partitions.py +12 -1
nucliadb/ingest/processing.py +17 -17
nucliadb/ingest/serialize.py +202 -145
nucliadb/ingest/service/writer.py +15 -114
nucliadb/ingest/settings.py +36 -15
nucliadb/ingest/utils.py +1 -2
nucliadb/learning_proxy.py +23 -26
nucliadb/metrics_exporter.py +20 -6
nucliadb/middleware/__init__.py +82 -1
nucliadb/migrator/datamanager.py +4 -11
nucliadb/migrator/migrator.py +1 -2
nucliadb/migrator/models.py +1 -2
nucliadb/migrator/settings.py +1 -2
nucliadb/models/internal/augment.py +614 -0
nucliadb/models/internal/processing.py +19 -19
nucliadb/openapi.py +2 -2
nucliadb/purge/__init__.py +3 -8
nucliadb/purge/orphan_shards.py +1 -2
nucliadb/reader/__init__.py +5 -0
nucliadb/reader/api/models.py +6 -13
nucliadb/reader/api/v1/download.py +59 -38
nucliadb/reader/api/v1/export_import.py +4 -4
nucliadb/reader/api/v1/knowledgebox.py +37 -9
nucliadb/reader/api/v1/learning_config.py +33 -14
nucliadb/reader/api/v1/resource.py +61 -9
nucliadb/reader/api/v1/services.py +18 -14
nucliadb/reader/app.py +3 -1
nucliadb/reader/reader/notifications.py +1 -2
nucliadb/search/api/v1/__init__.py +3 -0
nucliadb/search/api/v1/ask.py +3 -4
nucliadb/search/api/v1/augment.py +585 -0
nucliadb/search/api/v1/catalog.py +15 -19
nucliadb/search/api/v1/find.py +16 -22
nucliadb/search/api/v1/hydrate.py +328 -0
nucliadb/search/api/v1/knowledgebox.py +1 -2
nucliadb/search/api/v1/predict_proxy.py +1 -2
nucliadb/search/api/v1/resource/ask.py +28 -8
nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
nucliadb/search/api/v1/resource/search.py +9 -11
nucliadb/search/api/v1/retrieve.py +130 -0
nucliadb/search/api/v1/search.py +28 -32
nucliadb/search/api/v1/suggest.py +11 -14
nucliadb/search/api/v1/summarize.py +1 -2
nucliadb/search/api/v1/utils.py +2 -2
nucliadb/search/app.py +3 -2
nucliadb/search/augmentor/__init__.py +21 -0
nucliadb/search/augmentor/augmentor.py +232 -0
nucliadb/search/augmentor/fields.py +704 -0
nucliadb/search/augmentor/metrics.py +24 -0
nucliadb/search/augmentor/paragraphs.py +334 -0
nucliadb/search/augmentor/resources.py +238 -0
nucliadb/search/augmentor/utils.py +33 -0
nucliadb/search/lifecycle.py +3 -1
nucliadb/search/predict.py +33 -19
nucliadb/search/predict_models.py +8 -9
nucliadb/search/requesters/utils.py +11 -10
nucliadb/search/search/cache.py +19 -42
nucliadb/search/search/chat/ask.py +131 -59
nucliadb/search/search/chat/exceptions.py +3 -5
nucliadb/search/search/chat/fetcher.py +201 -0
nucliadb/search/search/chat/images.py +6 -4
nucliadb/search/search/chat/old_prompt.py +1375 -0
nucliadb/search/search/chat/parser.py +510 -0
nucliadb/search/search/chat/prompt.py +563 -615
nucliadb/search/search/chat/query.py +453 -32
nucliadb/search/search/chat/rpc.py +85 -0
nucliadb/search/search/fetch.py +3 -4
nucliadb/search/search/filters.py +8 -11
nucliadb/search/search/find.py +33 -31
nucliadb/search/search/find_merge.py +124 -331
nucliadb/search/search/graph_strategy.py +14 -12
nucliadb/search/search/hydrator/__init__.py +49 -0
nucliadb/search/search/hydrator/fields.py +217 -0
nucliadb/search/search/hydrator/images.py +130 -0
nucliadb/search/search/hydrator/paragraphs.py +323 -0
nucliadb/search/search/hydrator/resources.py +60 -0
nucliadb/search/search/ingestion_agents.py +5 -5
nucliadb/search/search/merge.py +90 -94
nucliadb/search/search/metrics.py +24 -7
nucliadb/search/search/paragraphs.py +7 -9
nucliadb/search/search/predict_proxy.py +44 -18
nucliadb/search/search/query.py +14 -86
nucliadb/search/search/query_parser/fetcher.py +51 -82
nucliadb/search/search/query_parser/models.py +19 -48
nucliadb/search/search/query_parser/old_filters.py +20 -19
nucliadb/search/search/query_parser/parsers/ask.py +5 -6
nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
nucliadb/search/search/query_parser/parsers/common.py +21 -13
nucliadb/search/search/query_parser/parsers/find.py +6 -29
nucliadb/search/search/query_parser/parsers/graph.py +18 -28
nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
nucliadb/search/search/query_parser/parsers/search.py +15 -56
nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
nucliadb/search/search/rank_fusion.py +18 -13
nucliadb/search/search/rerankers.py +6 -7
nucliadb/search/search/retrieval.py +300 -0
nucliadb/search/search/summarize.py +5 -6
nucliadb/search/search/utils.py +3 -4
nucliadb/search/settings.py +1 -2
nucliadb/standalone/api_router.py +1 -1
nucliadb/standalone/app.py +4 -3
nucliadb/standalone/auth.py +5 -6
nucliadb/standalone/lifecycle.py +2 -2
nucliadb/standalone/run.py +5 -4
nucliadb/standalone/settings.py +5 -6
nucliadb/standalone/versions.py +3 -4
nucliadb/tasks/consumer.py +13 -8
nucliadb/tasks/models.py +2 -1
nucliadb/tasks/producer.py +3 -3
nucliadb/tasks/retries.py +8 -7
nucliadb/train/api/utils.py +1 -3
nucliadb/train/api/v1/shards.py +1 -2
nucliadb/train/api/v1/trainset.py +1 -2
nucliadb/train/app.py +1 -1
nucliadb/train/generator.py +4 -4
nucliadb/train/generators/field_classifier.py +2 -2
nucliadb/train/generators/field_streaming.py +6 -6
nucliadb/train/generators/image_classifier.py +2 -2
nucliadb/train/generators/paragraph_classifier.py +2 -2
nucliadb/train/generators/paragraph_streaming.py +2 -2
nucliadb/train/generators/question_answer_streaming.py +2 -2
nucliadb/train/generators/sentence_classifier.py +4 -10
nucliadb/train/generators/token_classifier.py +3 -2
nucliadb/train/generators/utils.py +6 -5
nucliadb/train/nodes.py +3 -3
nucliadb/train/resource.py +6 -8
nucliadb/train/settings.py +3 -4
nucliadb/train/types.py +11 -11
nucliadb/train/upload.py +3 -2
nucliadb/train/uploader.py +1 -2
nucliadb/train/utils.py +1 -2
nucliadb/writer/api/v1/export_import.py +4 -1
nucliadb/writer/api/v1/field.py +15 -14
nucliadb/writer/api/v1/knowledgebox.py +18 -56
nucliadb/writer/api/v1/learning_config.py +5 -4
nucliadb/writer/api/v1/resource.py +9 -20
nucliadb/writer/api/v1/services.py +10 -132
nucliadb/writer/api/v1/upload.py +73 -72
nucliadb/writer/app.py +8 -2
nucliadb/writer/resource/basic.py +12 -15
nucliadb/writer/resource/field.py +43 -5
nucliadb/writer/resource/origin.py +7 -0
nucliadb/writer/settings.py +2 -3
nucliadb/writer/tus/__init__.py +2 -3
nucliadb/writer/tus/azure.py +5 -7
nucliadb/writer/tus/dm.py +3 -3
nucliadb/writer/tus/exceptions.py +3 -4
nucliadb/writer/tus/gcs.py +15 -22
nucliadb/writer/tus/s3.py +2 -3
nucliadb/writer/tus/storage.py +3 -3
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
nucliadb/common/datamanagers/entities.py +0 -139
nucliadb/common/external_index_providers/pinecone.py +0 -894
nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
nucliadb/search/search/hydrator.py +0 -197
nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/backups/tasks.py CHANGED Viewed

@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Awaitable, Callable
+from collections.abc import Awaitable, Callable
 from nucliadb.backups.const import BackupsNatsConfig
 from nucliadb.backups.create import backup_kb_task
@@ -38,6 +38,7 @@ def creator_consumer() -> NatsTaskConsumer[CreateBackupRequest]:
         callback=backup_kb_task,
         msg_type=CreateBackupRequest,
         max_concurrent_messages=10,
+        max_retries=100,
     )
     return consumer
@@ -64,6 +65,7 @@ def restorer_consumer() -> NatsTaskConsumer[RestoreBackupRequest]:
         callback=restore_kb_task,
         msg_type=RestoreBackupRequest,
         max_concurrent_messages=10,
+        max_retries=100,
     )
     return consumer
@@ -90,6 +92,7 @@ def deleter_consumer() -> NatsTaskConsumer[DeleteBackupRequest]:
         callback=delete_backup_task,
         msg_type=DeleteBackupRequest,
         max_concurrent_messages=2,
+        max_retries=100,
     )
     return consumer

nucliadb/common/back_pressure/cache.py CHANGED Viewed

@@ -21,7 +21,6 @@ import contextlib
 import logging
 import threading
 from datetime import datetime, timezone
-from typing import Optional
 from cachetools import TTLCache
@@ -47,7 +46,7 @@ class BackPressureCache:
         self._cache = TTLCache(maxsize=1024, ttl=5 * 60)
         self._lock = threading.Lock()
-    def get(self, key: str) -> Optional[BackPressureData]:
+    def get(self, key: str) -> BackPressureData | None:
         with self._lock:
             data = self._cache.get(key, None)
             if data is None:
@@ -72,7 +71,7 @@ def cached_back_pressure(cache_key: str):
     Context manager that handles the caching of the try again in time so that
     we don't recompute try again times if we have already applied back pressure.
     """
-    data: Optional[BackPressureData] = _cache.get(cache_key)
+    data: BackPressureData | None = _cache.get(cache_key)
     if data is not None:
         back_pressure_type = data.type
         RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})

nucliadb/common/back_pressure/materializer.py CHANGED Viewed

@@ -20,7 +20,6 @@
 import asyncio
 import logging
 import threading
-from typing import Optional
 from cachetools import TTLCache
 from fastapi import HTTPException
@@ -118,12 +117,6 @@ class BackPressureMaterializer:
                     extra={"kbid": kbid},
                 )
                 return 0
-            if pending > 0:
-                logger.info(
-                    f"Processing returned {pending} pending messages for KB",
-                    extra={"kbid": kbid},
-                )
             self.processing_pending_cache[kbid] = pending
             return pending
@@ -184,7 +177,7 @@ class BackPressureMaterializer:
                 pending=pending,
                 max_wait=settings.max_wait_time,
             )
-            data = BackPressureData(type="indexing", try_after=try_after)
+            data = BackPressureData(type="indexing", try_after=try_after, pending=pending)
             raise BackPressureException(data)
     def check_ingest(self):
@@ -199,7 +192,7 @@ class BackPressureMaterializer:
                 pending=ingest_pending,
                 max_wait=settings.max_wait_time,
             )
-            data = BackPressureData(type="ingest", try_after=try_after)
+            data = BackPressureData(type="ingest", try_after=try_after, pending=ingest_pending)
             raise BackPressureException(data)
     async def check_processing(self, kbid: str):
@@ -215,11 +208,11 @@ class BackPressureMaterializer:
                 pending=kb_pending,
                 max_wait=settings.max_wait_time,
             )
-            data = BackPressureData(type="processing", try_after=try_after)
+            data = BackPressureData(type="processing", try_after=try_after, pending=kb_pending)
             raise BackPressureException(data)
-MATERIALIZER: Optional[BackPressureMaterializer] = None
+MATERIALIZER: BackPressureMaterializer | None = None
 materializer_lock = threading.Lock()
@@ -268,7 +261,7 @@ def get_materializer() -> BackPressureMaterializer:
     return MATERIALIZER
-async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) -> None:
+async def maybe_back_pressure(kbid: str, resource_uuid: str | None = None) -> None:
     """
     This function does system checks to see if we need to put back pressure on writes.
     In that case, a HTTP 429 will be raised with the estimated time to try again.
@@ -278,7 +271,7 @@ async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) ->
     await back_pressure_checks(kbid, resource_uuid)
-async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
+async def back_pressure_checks(kbid: str, resource_uuid: str | None = None):
     """
     Will raise a 429 if back pressure is needed:
     - If the processing engine is behind.
@@ -299,6 +292,7 @@ async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
                 "resource_uuid": resource_uuid,
                 "try_after": exc.data.try_after,
                 "back_pressure_type": exc.data.type,
+                "pending": exc.data.pending,
             },
         )
         raise HTTPException(

nucliadb/common/back_pressure/settings.py CHANGED Viewed

@@ -29,30 +29,30 @@ class BackPressureSettings(BaseSettings):
     )
     indexing_rate: float = Field(
         default=10,
-        description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",
     )
     ingest_rate: float = Field(
         default=4,
-        description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",
     )
     processing_rate: float = Field(
         default=1,
-        description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",
     )
     max_indexing_pending: int = Field(
         default=1000,
-        description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",  # noqa
+        description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",
         alias="back_pressure_max_indexing_pending",
     )
     max_ingest_pending: int = Field(
         # Disabled by default
         default=0,
-        description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",  # noqa
+        description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",
         alias="back_pressure_max_ingest_pending",
     )
     max_processing_pending: int = Field(
         default=1000,
-        description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",  # noqa
+        description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",
         alias="back_pressure_max_processing_pending",
     )
     indexing_check_interval: int = Field(

nucliadb/common/back_pressure/utils.py CHANGED Viewed

@@ -28,6 +28,7 @@ from nucliadb_utils.nats import NatsConnectionManager
 class BackPressureData:
     type: str
     try_after: datetime
+    pending: int = 0
 class BackPressureException(Exception):

nucliadb/common/cache.py CHANGED Viewed

@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
 from contextvars import ContextVar
 from dataclasses import dataclass
 from functools import cached_property
-from typing import Generic, Optional, TypeVar
+from typing import Generic, TypeVar
 import backoff
 from async_lru import _LRUCacheWrapper, alru_cache
@@ -66,9 +66,9 @@ class Cache(Generic[K, T], ABC):
     """
-    cache: _LRUCacheWrapper[Optional[T]]
+    cache: _LRUCacheWrapper[T | None]
-    async def get(self, *args: K.args, **kwargs: K.kwargs) -> Optional[T]:
+    async def get(self, *args: K.args, **kwargs: K.kwargs) -> T | None:
         result = await self.cache(*args)
         # Do not cache None
         if result is None:
@@ -88,7 +88,7 @@ class Cache(Generic[K, T], ABC):
 class ResourceCache(Cache[[str, str], ResourceORM]):
     def __init__(self, cache_size: int) -> None:
         @alru_cache(maxsize=cache_size)
-        async def _get_resource(kbid: str, rid: str) -> Optional[ResourceORM]:
+        async def _get_resource(kbid: str, rid: str) -> ResourceORM | None:
             storage = await get_storage()
             async with get_driver().ro_transaction() as txn:
                 kb = KnowledgeBoxORM(txn, storage, kbid)
@@ -115,7 +115,7 @@ class ExtractedTextCache(Cache[[str, FieldId], ExtractedText]):
     def __init__(self, cache_size: int) -> None:
         @alru_cache(maxsize=cache_size)
         @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
-        async def _get_extracted_text(kbid: str, field_id: FieldId) -> Optional[ExtractedText]:
+        async def _get_extracted_text(kbid: str, field_id: FieldId) -> ExtractedText | None:
             storage = await get_storage()
             try:
                 sf = storage.file_extracted(
@@ -144,18 +144,18 @@ class ExtractedTextCache(Cache[[str, FieldId], ExtractedText]):
 # Global caches (per asyncio task)
-rcache: ContextVar[Optional[ResourceCache]] = ContextVar("rcache", default=None)
-etcache: ContextVar[Optional[ExtractedTextCache]] = ContextVar("etcache", default=None)
+rcache: ContextVar[ResourceCache | None] = ContextVar("rcache", default=None)
+etcache: ContextVar[ExtractedTextCache | None] = ContextVar("etcache", default=None)
 # Cache management
-def get_resource_cache() -> Optional[ResourceCache]:
+def get_resource_cache() -> ResourceCache | None:
     return rcache.get()
-def get_extracted_text_cache() -> Optional[ExtractedTextCache]:
+def get_extracted_text_cache() -> ExtractedTextCache | None:
     return etcache.get()

nucliadb/common/catalog/__init__.py ADDED Viewed

@@ -0,0 +1,79 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+from nidx_protos.noderesources_pb2 import Resource as IndexMessage
+from nucliadb.common.catalog.dummy import DummyCatalog
+from nucliadb.common.catalog.interface import Catalog, CatalogQuery
+from nucliadb.common.catalog.pg import PGCatalog
+from nucliadb.common.catalog.utils import build_catalog_resource_data
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.ingest.settings import CatalogConfig, settings
+from nucliadb_models.search import CatalogFacetsRequest, Resources
+from nucliadb_utils.exceptions import ConfigurationError
+def get_catalog() -> Catalog:
+    if settings.catalog == CatalogConfig.UNSET:
+        return DummyCatalog()
+    elif settings.catalog == CatalogConfig.PG:
+        return PGCatalog()
+    else:
+        raise ConfigurationError(f"Unknown catalog configuration: {settings.catalog}")
+async def catalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
+    catalog = get_catalog()
+    resource_data = build_catalog_resource_data(resource, index_message)
+    await catalog.update(txn, kbid, resource.uuid, resource_data)
+async def catalog_delete(txn: Transaction, kbid: str, rid: str):
+    catalog = get_catalog()
+    await catalog.delete(txn, kbid, rid)
+async def catalog_search(query: CatalogQuery) -> Resources:
+    catalog = get_catalog()
+    return await catalog.search(query)
+async def catalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
+    catalog = get_catalog()
+    return await catalog.facets(kbid, request)

nucliadb/common/catalog/dummy.py ADDED Viewed

@@ -0,0 +1,36 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb_models.search import CatalogFacetsRequest, Resources
+class DummyCatalog(Catalog):
+    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
+        return
+    async def delete(self, txn: Transaction, kbid: str, rid: str):
+        return
+    async def search(self, query: CatalogQuery) -> Resources:
+        return Resources(results=[], min_score=0.0)
+    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
+        return {}

nucliadb/common/catalog/interface.py ADDED Viewed

@@ -0,0 +1,85 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from __future__ import annotations
+import abc
+import datetime
+from dataclasses import dataclass
+from typing import Literal
+from pydantic import BaseModel, Field
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb_models import search as search_models
+from nucliadb_models.search import CatalogFacetsRequest, Resources
+class CatalogResourceData(BaseModel):
+    """
+    Data extracted from a resource to be indexed in the catalog
+    """
+    title: str = Field(description="Resource title")
+    created_at: datetime.datetime = Field(description="Resource creation date")
+    modified_at: datetime.datetime = Field(description="Resource last modification date")
+    labels: list[str] = Field(
+        description="Resource labels. This includes labels at the resource level and all classification labels of its fields"
+    )
+    slug: str = Field(description="Resource slug")
+@dataclass
+class CatalogExpression:
+    @dataclass
+    class Date:
+        field: Literal["created_at"] | Literal["modified_at"]
+        since: datetime.datetime | None
+        until: datetime.datetime | None
+    bool_and: list[CatalogExpression] | None = None
+    bool_or: list[CatalogExpression] | None = None
+    bool_not: CatalogExpression | None = None
+    date: Date | None = None
+    facet: str | None = None
+    resource_id: str | None = None
+class CatalogQuery(BaseModel):
+    kbid: str
+    query: search_models.CatalogQuery | None = Field(description="Full-text search query")
+    filters: CatalogExpression | None = Field(description="Filters to apply to the search")
+    sort: search_models.SortOptions = Field(description="Sorting option")
+    faceted: list[str] = Field(description="List of facets to compute during the search")
+    page_size: int = Field(description="Used for pagination. Maximum page size is 100")
+    page_number: int = Field(description="Used for pagination. First page is 0")
+class Catalog(abc.ABC, metaclass=abc.ABCMeta):
+    @abc.abstractmethod
+    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData): ...
+    @abc.abstractmethod
+    async def delete(self, txn: Transaction, kbid: str, rid: str): ...
+    @abc.abstractmethod
+    async def search(self, query: CatalogQuery) -> Resources: ...
+    @abc.abstractmethod
+    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]: ...

nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl