PyPI - nucliadb - Versions diffs - 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl - Mend

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

migrations/0023_backfill_pg_catalog.py +2 -2
migrations/0029_backfill_field_status.py +3 -4
migrations/0032_remove_old_relations.py +2 -3
migrations/0038_backfill_catalog_field_labels.py +2 -2
migrations/0039_backfill_converation_splits_metadata.py +2 -2
migrations/0041_reindex_conversations.py +137 -0
migrations/pg/0010_shards_index.py +34 -0
nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
migrations/pg/0012_catalog_statistics_undo.py +26 -0
nucliadb/backups/create.py +2 -15
nucliadb/backups/restore.py +4 -15
nucliadb/backups/tasks.py +4 -1
nucliadb/common/back_pressure/cache.py +2 -3
nucliadb/common/back_pressure/materializer.py +7 -13
nucliadb/common/back_pressure/settings.py +6 -6
nucliadb/common/back_pressure/utils.py +1 -0
nucliadb/common/cache.py +9 -9
nucliadb/common/catalog/interface.py +12 -12
nucliadb/common/catalog/pg.py +41 -29
nucliadb/common/catalog/utils.py +3 -3
nucliadb/common/cluster/manager.py +5 -4
nucliadb/common/cluster/rebalance.py +483 -114
nucliadb/common/cluster/rollover.py +25 -9
nucliadb/common/cluster/settings.py +3 -8
nucliadb/common/cluster/utils.py +34 -8
nucliadb/common/context/__init__.py +7 -8
nucliadb/common/context/fastapi.py +1 -2
nucliadb/common/datamanagers/__init__.py +2 -4
nucliadb/common/datamanagers/atomic.py +4 -2
nucliadb/common/datamanagers/cluster.py +1 -2
nucliadb/common/datamanagers/fields.py +3 -4
nucliadb/common/datamanagers/kb.py +6 -6
nucliadb/common/datamanagers/labels.py +2 -3
nucliadb/common/datamanagers/resources.py +10 -33
nucliadb/common/datamanagers/rollover.py +5 -7
nucliadb/common/datamanagers/search_configurations.py +1 -2
nucliadb/common/datamanagers/synonyms.py +1 -2
nucliadb/common/datamanagers/utils.py +4 -4
nucliadb/common/datamanagers/vectorsets.py +4 -4
nucliadb/common/external_index_providers/base.py +32 -5
nucliadb/common/external_index_providers/manager.py +4 -5
nucliadb/common/filter_expression.py +128 -40
nucliadb/common/http_clients/processing.py +12 -23
nucliadb/common/ids.py +6 -4
nucliadb/common/locking.py +1 -2
nucliadb/common/maindb/driver.py +9 -8
nucliadb/common/maindb/local.py +5 -5
nucliadb/common/maindb/pg.py +9 -8
nucliadb/common/nidx.py +3 -4
nucliadb/export_import/datamanager.py +4 -3
nucliadb/export_import/exporter.py +11 -19
nucliadb/export_import/importer.py +13 -6
nucliadb/export_import/tasks.py +2 -0
nucliadb/export_import/utils.py +6 -18
nucliadb/health.py +2 -2
nucliadb/ingest/app.py +8 -8
nucliadb/ingest/consumer/consumer.py +8 -10
nucliadb/ingest/consumer/pull.py +3 -8
nucliadb/ingest/consumer/service.py +3 -3
nucliadb/ingest/consumer/utils.py +1 -1
nucliadb/ingest/fields/base.py +28 -49
nucliadb/ingest/fields/conversation.py +12 -12
nucliadb/ingest/fields/exceptions.py +1 -2
nucliadb/ingest/fields/file.py +22 -8
nucliadb/ingest/fields/link.py +7 -7
nucliadb/ingest/fields/text.py +2 -3
nucliadb/ingest/orm/brain_v2.py +78 -64
nucliadb/ingest/orm/broker_message.py +2 -4
nucliadb/ingest/orm/entities.py +10 -209
nucliadb/ingest/orm/index_message.py +4 -4
nucliadb/ingest/orm/knowledgebox.py +18 -27
nucliadb/ingest/orm/processor/auditing.py +1 -3
nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
nucliadb/ingest/orm/processor/processor.py +27 -27
nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
nucliadb/ingest/orm/resource.py +72 -70
nucliadb/ingest/orm/utils.py +1 -1
nucliadb/ingest/processing.py +17 -17
nucliadb/ingest/serialize.py +202 -145
nucliadb/ingest/service/writer.py +3 -109
nucliadb/ingest/settings.py +3 -4
nucliadb/ingest/utils.py +1 -2
nucliadb/learning_proxy.py +11 -11
nucliadb/metrics_exporter.py +5 -4
nucliadb/middleware/__init__.py +82 -1
nucliadb/migrator/datamanager.py +3 -4
nucliadb/migrator/migrator.py +1 -2
nucliadb/migrator/models.py +1 -2
nucliadb/migrator/settings.py +1 -2
nucliadb/models/internal/augment.py +614 -0
nucliadb/models/internal/processing.py +19 -19
nucliadb/openapi.py +2 -2
nucliadb/purge/__init__.py +3 -8
nucliadb/purge/orphan_shards.py +1 -2
nucliadb/reader/__init__.py +5 -0
nucliadb/reader/api/models.py +6 -13
nucliadb/reader/api/v1/download.py +59 -38
nucliadb/reader/api/v1/export_import.py +4 -4
nucliadb/reader/api/v1/learning_config.py +24 -4
nucliadb/reader/api/v1/resource.py +61 -9
nucliadb/reader/api/v1/services.py +18 -14
nucliadb/reader/app.py +3 -1
nucliadb/reader/reader/notifications.py +1 -2
nucliadb/search/api/v1/__init__.py +2 -0
nucliadb/search/api/v1/ask.py +3 -4
nucliadb/search/api/v1/augment.py +585 -0
nucliadb/search/api/v1/catalog.py +11 -15
nucliadb/search/api/v1/find.py +16 -22
nucliadb/search/api/v1/hydrate.py +25 -25
nucliadb/search/api/v1/knowledgebox.py +1 -2
nucliadb/search/api/v1/predict_proxy.py +1 -2
nucliadb/search/api/v1/resource/ask.py +7 -7
nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
nucliadb/search/api/v1/resource/search.py +9 -11
nucliadb/search/api/v1/retrieve.py +130 -0
nucliadb/search/api/v1/search.py +28 -32
nucliadb/search/api/v1/suggest.py +11 -14
nucliadb/search/api/v1/summarize.py +1 -2
nucliadb/search/api/v1/utils.py +2 -2
nucliadb/search/app.py +3 -2
nucliadb/search/augmentor/__init__.py +21 -0
nucliadb/search/augmentor/augmentor.py +232 -0
nucliadb/search/augmentor/fields.py +704 -0
nucliadb/search/augmentor/metrics.py +24 -0
nucliadb/search/augmentor/paragraphs.py +334 -0
nucliadb/search/augmentor/resources.py +238 -0
nucliadb/search/augmentor/utils.py +33 -0
nucliadb/search/lifecycle.py +3 -1
nucliadb/search/predict.py +24 -17
nucliadb/search/predict_models.py +8 -9
nucliadb/search/requesters/utils.py +11 -10
nucliadb/search/search/cache.py +19 -23
nucliadb/search/search/chat/ask.py +88 -59
nucliadb/search/search/chat/exceptions.py +3 -5
nucliadb/search/search/chat/fetcher.py +201 -0
nucliadb/search/search/chat/images.py +6 -4
nucliadb/search/search/chat/old_prompt.py +1375 -0
nucliadb/search/search/chat/parser.py +510 -0
nucliadb/search/search/chat/prompt.py +563 -615
nucliadb/search/search/chat/query.py +449 -36
nucliadb/search/search/chat/rpc.py +85 -0
nucliadb/search/search/fetch.py +3 -4
nucliadb/search/search/filters.py +8 -11
nucliadb/search/search/find.py +33 -31
nucliadb/search/search/find_merge.py +124 -331
nucliadb/search/search/graph_strategy.py +14 -12
nucliadb/search/search/hydrator/__init__.py +3 -152
nucliadb/search/search/hydrator/fields.py +92 -50
nucliadb/search/search/hydrator/images.py +7 -7
nucliadb/search/search/hydrator/paragraphs.py +42 -26
nucliadb/search/search/hydrator/resources.py +20 -16
nucliadb/search/search/ingestion_agents.py +5 -5
nucliadb/search/search/merge.py +90 -94
nucliadb/search/search/metrics.py +10 -9
nucliadb/search/search/paragraphs.py +7 -9
nucliadb/search/search/predict_proxy.py +13 -9
nucliadb/search/search/query.py +14 -86
nucliadb/search/search/query_parser/fetcher.py +51 -82
nucliadb/search/search/query_parser/models.py +19 -20
nucliadb/search/search/query_parser/old_filters.py +20 -19
nucliadb/search/search/query_parser/parsers/ask.py +4 -5
nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
nucliadb/search/search/query_parser/parsers/common.py +5 -6
nucliadb/search/search/query_parser/parsers/find.py +6 -26
nucliadb/search/search/query_parser/parsers/graph.py +13 -23
nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
nucliadb/search/search/query_parser/parsers/search.py +15 -53
nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
nucliadb/search/search/rank_fusion.py +18 -13
nucliadb/search/search/rerankers.py +5 -6
nucliadb/search/search/retrieval.py +300 -0
nucliadb/search/search/summarize.py +5 -6
nucliadb/search/search/utils.py +3 -4
nucliadb/search/settings.py +1 -2
nucliadb/standalone/api_router.py +1 -1
nucliadb/standalone/app.py +4 -3
nucliadb/standalone/auth.py +5 -6
nucliadb/standalone/lifecycle.py +2 -2
nucliadb/standalone/run.py +2 -4
nucliadb/standalone/settings.py +5 -6
nucliadb/standalone/versions.py +3 -4
nucliadb/tasks/consumer.py +13 -8
nucliadb/tasks/models.py +2 -1
nucliadb/tasks/producer.py +3 -3
nucliadb/tasks/retries.py +8 -7
nucliadb/train/api/utils.py +1 -3
nucliadb/train/api/v1/shards.py +1 -2
nucliadb/train/api/v1/trainset.py +1 -2
nucliadb/train/app.py +1 -1
nucliadb/train/generator.py +4 -4
nucliadb/train/generators/field_classifier.py +2 -2
nucliadb/train/generators/field_streaming.py +6 -6
nucliadb/train/generators/image_classifier.py +2 -2
nucliadb/train/generators/paragraph_classifier.py +2 -2
nucliadb/train/generators/paragraph_streaming.py +2 -2
nucliadb/train/generators/question_answer_streaming.py +2 -2
nucliadb/train/generators/sentence_classifier.py +2 -2
nucliadb/train/generators/token_classifier.py +3 -2
nucliadb/train/generators/utils.py +6 -5
nucliadb/train/nodes.py +3 -3
nucliadb/train/resource.py +6 -8
nucliadb/train/settings.py +3 -4
nucliadb/train/types.py +11 -11
nucliadb/train/upload.py +3 -2
nucliadb/train/uploader.py +1 -2
nucliadb/train/utils.py +1 -2
nucliadb/writer/api/v1/export_import.py +4 -1
nucliadb/writer/api/v1/field.py +7 -11
nucliadb/writer/api/v1/knowledgebox.py +3 -4
nucliadb/writer/api/v1/resource.py +9 -20
nucliadb/writer/api/v1/services.py +10 -132
nucliadb/writer/api/v1/upload.py +73 -72
nucliadb/writer/app.py +8 -2
nucliadb/writer/resource/basic.py +12 -15
nucliadb/writer/resource/field.py +7 -5
nucliadb/writer/resource/origin.py +7 -0
nucliadb/writer/settings.py +2 -3
nucliadb/writer/tus/__init__.py +2 -3
nucliadb/writer/tus/azure.py +1 -3
nucliadb/writer/tus/dm.py +3 -3
nucliadb/writer/tus/exceptions.py +3 -4
nucliadb/writer/tus/gcs.py +5 -6
nucliadb/writer/tus/s3.py +2 -3
nucliadb/writer/tus/storage.py +3 -3
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
nucliadb/common/datamanagers/entities.py +0 -139
nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/common/datamanagers/utils.py CHANGED Viewed

@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import contextlib
-from typing import Optional, Type, TypeVar
+from typing import TypeVar
 from google.protobuf.message import Message
@@ -29,9 +29,9 @@ PB_TYPE = TypeVar("PB_TYPE", bound=Message)
 async def get_kv_pb(
-    txn: Transaction, key: str, pb_type: Type[PB_TYPE], for_update: bool = True
-) -> Optional[PB_TYPE]:
-    serialized: Optional[bytes] = await txn.get(key, for_update=for_update)
+    txn: Transaction, key: str, pb_type: type[PB_TYPE], for_update: bool = True
+) -> PB_TYPE | None:
+    serialized: bytes | None = await txn.get(key, for_update=for_update)
     if serialized is None:
         return None
     pb = pb_type()

nucliadb/common/datamanagers/vectorsets.py CHANGED Viewed

@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import AsyncIterator, Optional
+from collections.abc import AsyncIterator
 from nucliadb.common.datamanagers.utils import get_kv_pb
 from nucliadb.common.maindb.driver import Transaction
@@ -37,7 +37,7 @@ async def initialize(txn: Transaction, *, kbid: str):
 async def get(
     txn: Transaction, *, kbid: str, vectorset_id: str
-) -> Optional[knowledgebox_pb2.VectorSetConfig]:
+) -> knowledgebox_pb2.VectorSetConfig | None:
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
     index = _find_vectorset(kb_vectorsets, vectorset_id)
     if index is None:
@@ -80,7 +80,7 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
 async def delete(
     txn: Transaction, *, kbid: str, vectorset_id: str
-) -> Optional[knowledgebox_pb2.VectorSetConfig]:
+) -> knowledgebox_pb2.VectorSetConfig | None:
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
     index = _find_vectorset(kb_vectorsets, vectorset_id)
     if index is None:
@@ -111,7 +111,7 @@ async def _get_or_default(
 def _find_vectorset(
     kb_vectorsets: knowledgebox_pb2.KnowledgeBoxVectorSetsConfig, vectorset_id: str
-) -> Optional[int]:
+) -> int | None:
     """Return the position of the vectorset in `vectorsets` or `None` if not found."""
     for idx, vectorset in enumerate(kb_vectorsets.vectorsets):
         if vectorset.vectorset_id == vectorset_id:

nucliadb/common/external_index_providers/base.py CHANGED Viewed

@@ -19,8 +19,9 @@
 #
 import abc
 import logging
+from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import Any, Iterator, Optional
+from typing import Any
 from nidx_protos.nodereader_pb2 import SearchRequest
 from nidx_protos.noderesources_pb2 import Resource
@@ -30,7 +31,9 @@ from nucliadb.common.counters import IndexCounts
 from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
 from nucliadb.common.ids import ParagraphId
 from nucliadb_models.external_index_providers import ExternalIndexProviderType
+from nucliadb_models.retrieval import Score
 from nucliadb_models.search import SCORE_TYPE, Relations, TextPosition
+from nucliadb_protos import resources_pb2
 from nucliadb_protos.knowledgebox_pb2 import (
     CreateExternalIndexProviderMetadata,
     StoredExternalIndexProviderMetadata,
@@ -43,6 +46,16 @@ logger = logging.getLogger(__name__)
 manager_observer = Observer("external_index_manager", labels={"operation": "", "provider": ""})
+# /k/ocr
+_OCR_LABEL = (
+    f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.OCR).lower()}"
+)
+# /k/inception
+_INCEPTION_LABEL = (
+    f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.OCR).lower()}"
+)
 @dataclass
 class VectorsetExternalIndex:
     """
@@ -57,9 +70,19 @@ class VectorsetExternalIndex:
 class ScoredTextBlock(BaseModel):
     paragraph_id: ParagraphId
-    score: float
     score_type: SCORE_TYPE
+    scores: list[Score]
+    @property
+    def score(self) -> float:
+        return self.current_score.score
+    @property
+    def current_score(self) -> Score:
+        assert len(self.scores) > 0, "text block matches must be scored"
+        return self.scores[-1]
 class TextBlockMatch(ScoredTextBlock):
     """
@@ -72,11 +95,15 @@ class TextBlockMatch(ScoredTextBlock):
     page_with_visual: bool = False
     fuzzy_search: bool
     is_a_table: bool = False
-    representation_file: Optional[str] = None
+    representation_file: str | None = None
     paragraph_labels: list[str] = []
     field_labels: list[str] = []
-    text: Optional[str] = None
-    relevant_relations: Optional[Relations] = None
+    text: str | None = None
+    relevant_relations: Relations | None = None
+    @property
+    def is_an_image(self) -> bool:
+        return _OCR_LABEL in self.paragraph_labels or _INCEPTION_LABEL in self.paragraph_labels
 class QueryResults(BaseModel):

nucliadb/common/external_index_providers/manager.py CHANGED Viewed

@@ -17,7 +17,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Optional
 import async_lru
@@ -30,7 +29,7 @@ from nucliadb_protos.knowledgebox_pb2 import (
 async def get_external_index_manager(
     kbid: str, for_rollover: bool = False
-) -> Optional[ExternalIndexManager]:
+) -> ExternalIndexManager | None:
     """
     Returns an ExternalIndexManager for the given kbid.
     If for_rollover is True, the ExternalIndexManager returned will include the rollover indexes (if any).
@@ -39,12 +38,12 @@ async def get_external_index_manager(
 @async_lru.alru_cache(maxsize=None)
-async def get_external_index_metadata(kbid: str) -> Optional[StoredExternalIndexProviderMetadata]:
+async def get_external_index_metadata(kbid: str) -> StoredExternalIndexProviderMetadata | None:
     return await datamanagers.atomic.kb.get_external_index_provider_metadata(kbid=kbid)
 @async_lru.alru_cache(maxsize=None)
-async def get_default_vectorset_id(kbid: str) -> Optional[str]:
+async def get_default_vectorset_id(kbid: str) -> str | None:
     """
     While we are transitioning to the new vectorset system, we need to take into account
     that KBs that have only one semantic model will have the `vectorset_id` field on BrokerMessage.field_vectors
@@ -68,6 +67,6 @@ async def get_default_vectorset_id(kbid: str) -> Optional[str]:
 async def get_rollover_external_index_metadata(
     kbid: str,
-) -> Optional[StoredExternalIndexProviderMetadata]:
+) -> StoredExternalIndexProviderMetadata | None:
     async with datamanagers.with_ro_transaction() as txn:
         return await datamanagers.rollover.get_kb_rollover_external_index_metadata(txn, kbid=kbid)

nucliadb/common/filter_expression.py CHANGED Viewed

@@ -18,13 +18,14 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Union
 from nidx_protos.nodereader_pb2 import FilterExpression as PBFilterExpression
+from typing_extensions import assert_never
 from nucliadb.common import datamanagers
 from nucliadb.common.exceptions import InvalidQueryError
 from nucliadb.common.ids import FIELD_TYPE_NAME_TO_STR
+from nucliadb_models.common import Paragraph
 from nucliadb_models.filters import (
     And,
     DateCreated,
@@ -50,44 +51,28 @@ from nucliadb_models.filters import (
     ResourceMimetype,
     Status,
 )
+from nucliadb_models.metadata import ResourceProcessingStatus
 # Filters that end up as a facet
-FacetFilter = Union[
-    OriginTag,
-    Label,
-    ResourceMimetype,
-    FieldMimetype,
-    Entity,
-    Language,
-    OriginMetadata,
-    OriginPath,
-    Generated,
-    Kind,
-    OriginCollaborator,
-    OriginSource,
-    Status,
-]
-# In Python 3.9 we cannot do isinstance against an union
-# Once we support only 3.10+, we can remove this
-FacetFilterTypes = (
-    OriginTag,
-    Label,
-    ResourceMimetype,
-    FieldMimetype,
-    Entity,
-    Language,
-    OriginMetadata,
-    OriginPath,
-    Generated,
-    Kind,
-    OriginCollaborator,
-    OriginSource,
-    Status,
+FacetFilter = (
+    OriginTag
+    | Label
+    | ResourceMimetype
+    | FieldMimetype
+    | Entity
+    | Language
+    | OriginMetadata
+    | OriginPath
+    | Generated
+    | Kind
+    | OriginCollaborator
+    | OriginSource
+    | Status
 )
 async def parse_expression(
-    expr: Union[FieldFilterExpression, ParagraphFilterExpression],
+    expr: FieldFilterExpression | ParagraphFilterExpression,
     kbid: str,
 ) -> PBFilterExpression:
     f = PBFilterExpression()
@@ -131,12 +116,10 @@ async def parse_expression(
             f.date.since.FromDatetime(expr.since)
         if expr.until:
             f.date.until.FromDatetime(expr.until)
-    elif isinstance(expr, FacetFilterTypes):
+    elif isinstance(expr, FacetFilter):
         f.facet.facet = facet_from_filter(expr)
     else:
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(expr)
     return f
@@ -190,13 +173,118 @@ def facet_from_filter(expr: FacetFilter) -> str:
     elif isinstance(expr, Status):
         facet = f"/n/s/{expr.status.value}"
     else:
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(expr)
     return facet
+def filter_from_facet(facet: str) -> FacetFilter:
+    expr: FacetFilter
+    if facet.startswith("/t/"):
+        value = facet.removeprefix("/t/")
+        expr = OriginTag(tag=value)
+    elif facet.startswith("/l/"):
+        value = facet.removeprefix("/l/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            type = parts[0]
+            expr = Label(labelset=type)
+        else:
+            type, subtype = parts
+            expr = Label(labelset=type, label=subtype)
+    elif facet.startswith("/n/i/"):
+        value = facet.removeprefix("/n/i/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            type = parts[0]
+            expr = ResourceMimetype(type=type)
+        else:
+            type, subtype = parts
+            expr = ResourceMimetype(type=type, subtype=subtype)
+    elif facet.startswith("/mt/"):
+        value = facet.removeprefix("/mt/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            type = parts[0]
+            expr = FieldMimetype(type=type)
+        else:
+            type, subtype = parts
+            expr = FieldMimetype(type=type, subtype=subtype)
+    elif facet.startswith("/e/"):
+        value = facet.removeprefix("/e/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            subtype = parts[0]
+            expr = Entity(subtype=subtype)
+        else:
+            subtype, value = parts
+            expr = Entity(subtype=subtype, value=value)
+    elif facet.startswith("/s/p"):
+        value = facet.removeprefix("/s/p/")
+        expr = Language(language=value, only_primary=True)
+    elif facet.startswith("/s/s"):
+        value = facet.removeprefix("/s/s/")
+        expr = Language(language=value, only_primary=False)
+    elif facet.startswith("/m/"):
+        value = facet.removeprefix("/m/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            field = parts[0]
+            expr = OriginMetadata(field=field)
+        else:
+            field, value = parts
+            expr = OriginMetadata(field=field, value=value)
+    elif facet.startswith("/p/"):
+        value = facet.removeprefix("/p/")
+        expr = OriginPath(prefix=value)
+    elif facet.startswith("/g/da"):
+        value = facet.removeprefix("/g/da")
+        expr = expr = Generated(by="data-augmentation")
+        if value.removeprefix("/"):
+            expr.da_task = value.removeprefix("/")
+    elif facet.startswith("/k/"):
+        value = facet.removeprefix("/k/")
+        try:
+            kind = Paragraph.TypeParagraph(value.upper())
+        except ValueError:
+            raise InvalidQueryError("filters", f"invalid paragraph kind: {value}")
+        expr = Kind(kind=kind)
+    elif facet.startswith("/u/o/"):
+        value = facet.removeprefix("/u/o/")
+        expr = OriginCollaborator(collaborator=value)
+    elif facet.startswith("/u/s"):
+        value = facet.removeprefix("/u/s")
+        expr = OriginSource()
+        if value.removeprefix("/"):
+            expr.id = value.removeprefix("/")
+    elif facet.startswith("/n/s/"):
+        value = facet.removeprefix("/n/s/")
+        try:
+            status = ResourceProcessingStatus(value.upper())
+        except ValueError:
+            raise InvalidQueryError("filters", f"invalid resource processing status: {value}")
+        expr = Status(status=status)
+    else:
+        raise InvalidQueryError("filters", f"invalid filter: {facet}")
+    return expr
 def add_and_expression(dest: PBFilterExpression, add: PBFilterExpression):
     dest_expr_type = dest.WhichOneof("expr")
     if dest_expr_type is None:

nucliadb/common/http_clients/processing.py CHANGED Viewed

@@ -19,10 +19,8 @@
 #
 import logging
 from datetime import datetime
-from typing import Optional
 import aiohttp
-import jwt
 import pydantic
 from nucliadb_utils.helpers import MessageProgressUpdater
@@ -33,15 +31,6 @@ from .utils import check_status
 logger = logging.getLogger(__name__)
-def get_nua_api_id() -> str:
-    assert nuclia_settings.nuclia_service_account is not None
-    claimset = jwt.decode(
-        nuclia_settings.nuclia_service_account,
-        options={"verify_signature": False},
-    )
-    return claimset.get("sub")
 def get_processing_api_url() -> str:
     if nuclia_settings.nuclia_service_account:
         return (
@@ -64,10 +53,10 @@ def get_processing_api_v2_url() -> str:
 class PullResponse(pydantic.BaseModel):
     status: str
-    payload: Optional[str] = None
+    payload: str | None = None
     payloads: list[bytes] = []
-    msgid: Optional[str] = None
-    cursor: Optional[int] = None
+    msgid: str | None = None
+    cursor: int | None = None
 class PullPosition(pydantic.BaseModel):
@@ -86,7 +75,7 @@ class RequestsResult(pydantic.BaseModel):
         description="Resource ID.",
     )
     kbid: str = pydantic.Field(..., title="KnowledgeBox ID")
-    title: Optional[str] = pydantic.Field(
+    title: str | None = pydantic.Field(
         None,
         title="Title",
         description="Title of the resource.",
@@ -111,12 +100,12 @@ class RequestsResult(pydantic.BaseModel):
         title="Timestamp",
         description="Timestamp of when the resource was first scheduled.",
     )
-    completed_at: Optional[datetime] = pydantic.Field(
+    completed_at: datetime | None = pydantic.Field(
         None,
         title="Completed At",
         description="Timestamp of when the resource was completed",
     )
-    scheduled_at: Optional[datetime] = pydantic.Field(
+    scheduled_at: datetime | None = pydantic.Field(
         None,
         title="Scheduled At",
         description="Timestamp of when the resource was first scheduled.",
@@ -149,7 +138,7 @@ class RequestsResults(pydantic.BaseModel):
         title="Results",
         description="List of results.",
     )
-    cursor: Optional[str] = pydantic.Field(
+    cursor: str | None = pydantic.Field(
         None,
         title="Cursor",
         description="Cursor to use for the next page of results.",
@@ -224,7 +213,7 @@ class ProcessingHTTPClient:
     async def pull_v2(
         self, ack_tokens: list[str], limit: int = 1, timeout: float = 5
-    ) -> Optional[PullResponseV2]:
+    ) -> PullResponseV2 | None:
         url = self.base_url_v2 + "/pull"
         request = PullRequestV2(limit=limit, timeout=timeout, ack=ack_tokens)
         async with self.session.post(
@@ -248,9 +237,9 @@ class ProcessingHTTPClient:
     async def requests(
         self,
-        cursor: Optional[str] = None,
-        scheduled: Optional[bool] = None,
-        kbid: Optional[str] = None,
+        cursor: str | None = None,
+        scheduled: bool | None = None,
+        kbid: str | None = None,
         limit: int = 20,
     ) -> RequestsResults:
         url = self.base_url + "/requests"
@@ -267,7 +256,7 @@ class ProcessingHTTPClient:
             check_status(resp, resp_text)
             return RequestsResults.model_validate_json(resp_text)
-    async def stats(self, kbid: str, timeout: Optional[float] = 1.0) -> StatsResponse:
+    async def stats(self, kbid: str, timeout: float | None = 1.0) -> StatsResponse:
         url = self.base_url + "/stats"
         async with self.session.get(
             url,

nucliadb/common/ids.py CHANGED Viewed

@@ -24,7 +24,6 @@ paragraphs... Avoiding spread of id construction and parsing everywhere
 """
 from dataclasses import dataclass
-from typing import Optional
 from nucliadb_models.common import FieldTypeName
 from nucliadb_protos.resources_pb2 import FieldType
@@ -77,7 +76,7 @@ class FieldId:
     type: str
     key: str
     # also knwon as `split`, this indicates a part of a field in, for example, conversations
-    subfield_id: Optional[str] = None
+    subfield_id: str | None = None
     @classmethod
     def from_string(cls, value: str) -> "FieldId":
@@ -113,7 +112,7 @@ class FieldId:
     @classmethod
     def from_pb(
-        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: str | None = None
     ) -> "FieldId":
         return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
@@ -127,6 +126,9 @@ class FieldId:
         else:
             return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+    def full_without_subfield(self) -> str:
+        return f"{self.rid}/{self.type}/{self.key}"
     def short_without_subfield(self) -> str:
         return f"/{self.type}/{self.key}"
@@ -262,7 +264,7 @@ class VectorId:
         return hash(self.full())
-def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
+def extract_data_augmentation_id(generated_field_id: str) -> str | None:
     """Data augmentation generated fields have a strict id with the following
     format:
     `da-{task_id}-{original:field_type}-{original:field_id}[-{original:split}]`

nucliadb/common/locking.py CHANGED Viewed

@@ -22,7 +22,6 @@ import logging
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Optional
 import orjson
@@ -99,7 +98,7 @@ class _Lock:
         self.task = asyncio.create_task(self._refresh_task())
         return self
-    async def get_lock_data(self, txn: Transaction) -> Optional[LockValue]:
+    async def get_lock_data(self, txn: Transaction) -> LockValue | None:
         existing_data = await txn.get(self.key, for_update=True)
         if existing_data is None:
             return None

nucliadb/common/maindb/driver.py CHANGED Viewed

@@ -20,8 +20,9 @@
 from __future__ import annotations
 import asyncio
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
+from typing import ClassVar
 DEFAULT_SCAN_LIMIT = -1
 DEFAULT_BATCH_SCAN_LIMIT = 500
@@ -37,10 +38,10 @@ class Transaction:
     async def commit(self):
         raise NotImplementedError()
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
         raise NotImplementedError()
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         raise NotImplementedError()
     async def set(self, key: str, value: bytes):
@@ -57,7 +58,7 @@ class Transaction:
     def keys(
         self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         raise NotImplementedError()
     async def count(self, match: str) -> int:
@@ -66,7 +67,7 @@ class Transaction:
 class Driver:
     initialized = False
-    _abort_tasks: list[asyncio.Task] = []
+    _abort_tasks: ClassVar[list[asyncio.Task]] = []
     async def initialize(self):
         raise NotImplementedError()
@@ -81,15 +82,15 @@ class Driver:
                     pass
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         yield Transaction()
     @asynccontextmanager
-    async def ro_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def ro_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=True) as txn:
             yield txn
     @asynccontextmanager
-    async def rw_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def rw_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=False) as txn:
             yield txn

nucliadb/common/maindb/local.py CHANGED Viewed

@@ -19,8 +19,8 @@
 #
 import glob
 import os
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
 from nucliadb.common.maindb.driver import (
     DEFAULT_BATCH_SCAN_LIMIT,
@@ -78,7 +78,7 @@ class LocalTransaction(Transaction):
             # Deleting a key that does not exist
             pass
-    async def read(self, key: str) -> Optional[bytes]:
+    async def read(self, key: str) -> bytes | None:
         try:
             async with aiofiles.open(self.compute_path(key), "rb") as resp:
                 return await resp.read()
@@ -106,8 +106,8 @@ class LocalTransaction(Transaction):
         self.clean()
         self.open = False
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
-        results: list[Optional[bytes]] = []
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
+        results: list[bytes | None] = []
         for key in keys:
             obj = await self.get(key)
             if obj:
@@ -125,7 +125,7 @@ class LocalTransaction(Transaction):
         return results
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         if key in self.deleted_keys:
             raise KeyError(f"Not found {key}")

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl