nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/api/v1/augment.py (new file)
@@ -0,0 +1,585 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import asyncio
+from typing import cast
+
+from fastapi import Header, Request
+from fastapi_versioning import version
+
+from nucliadb.common.ids import FieldId, ParagraphId
+from nucliadb.models.internal import augment as internal_augment
+from nucliadb.models.internal.augment import (
+    Augment,
+    Augmented,
+    ConversationAnswerOrAfter,
+    ConversationAttachments,
+    ConversationAugment,
+    ConversationProp,
+    ConversationSelector,
+    ConversationText,
+    DeepResourceAugment,
+    FieldAugment,
+    FieldClassificationLabels,
+    FieldEntities,
+    FieldProp,
+    FieldText,
+    FileAugment,
+    FileProp,
+    FileThumbnail,
+    FullSelector,
+    MessageSelector,
+    Metadata,
+    Paragraph,
+    ParagraphAugment,
+    ParagraphImage,
+    ParagraphPage,
+    ParagraphPosition,
+    ParagraphProp,
+    ParagraphTable,
+    ParagraphText,
+    RelatedParagraphs,
+    ResourceAugment,
+    ResourceClassificationLabels,
+    ResourceProp,
+    ResourceSummary,
+    ResourceTitle,
+    WindowSelector,
+)
+from nucliadb.search.api.v1.router import KB_PREFIX, api
+from nucliadb.search.augmentor import augmentor
+from nucliadb.search.search.cache import request_caches
+from nucliadb_models.augment import (
+    AugmentedConversationField,
+    AugmentedConversationMessage,
+    AugmentedField,
+    AugmentedFileField,
+    AugmentedParagraph,
+    AugmentedResource,
+    AugmentParagraphs,
+    AugmentRequest,
+    AugmentResources,
+    AugmentResponse,
+)
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
+from nucliadb_models.search import NucliaDBClientType, ResourceProperties
+from nucliadb_utils.authentication import requires
+
+
+@api.post(
+    f"/{KB_PREFIX}/{{kbid}}/augment",
+    status_code=200,
+    description="Augment data on a Knowledge Box",
+    include_in_schema=False,
+    tags=["Augment"],
+)
+@requires(NucliaDBRoles.READER)
+@version(1)
+async def _augment_endpoint(
+    request: Request,
+    kbid: str,
+    item: AugmentRequest,
+    x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
+    x_nucliadb_user: str = Header(""),
+    x_forwarded_for: str = Header(""),
+) -> AugmentResponse:
+    return await augment_endpoint(kbid, item)
+
+
+async def augment_endpoint(kbid: str, item: AugmentRequest) -> AugmentResponse:
+    augmentations = parse_first_augments(item)
+
+    if len(augmentations) == 0:
+        return AugmentResponse(resources={}, fields={}, paragraphs={})
+
+    with request_caches():
+        max_ops = asyncio.Semaphore(50)
+
+        first_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
+        response = build_augment_response(item, first_augmented)
+
+        # 2nd round trip to augmentor
+        #
+        # There are some augmentations that require some augmented content to be
+        # able to keep augmenting, such as neighbour paragraphs.
+        #
+        # However, as much of the data is already cached (when using the cache),
+        # this second round should be orders of magnitude faster than the first.
+        #
+        augmentations = parse_second_augments(item, first_augmented)
+        if len(augmentations) > 0:
+            second_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
+            merge_second_augment(item, response, second_augmented)
+
+    return response
+
+
+def parse_first_augments(item: AugmentRequest) -> list[Augment]:
+    """Parse an augment request and return a list of internal augments to
+    fulfill as much of the requested information as it can.
+
+    Notice there are augments that will require a 2nd round trip to the
+    augmentor, e.g., neighbouring paragraphs. This makes the code a bit more
+    convoluted but avoids synchronization between augments, as many paragraphs
+    could lead to the same neighbours.
+
+    """
+    augmentations: list[Augment] = []
+
+    if item.resources is not None:
+        for resource_augment in item.resources:
+            show, extracted, resource_select = parse_deep_resource_augment(resource_augment)
+            if resource_augment.field_type_filter is None:
+                field_type_filter = list(FieldTypeName)
+            else:
+                field_type_filter = resource_augment.field_type_filter
+
+            if show:
+                augmentations.append(
+                    DeepResourceAugment(
+                        given=resource_augment.given,
+                        show=show,
+                        extracted=extracted,
+                        field_type_filter=field_type_filter,
+                    )
+                )
+            if resource_select:
+                augmentations.append(
+                    ResourceAugment(
+                        given=resource_augment.given,  # type: ignore[arg-type]
+                        select=resource_select,
+                    )
+                )
+
+            if resource_augment.fields is not None:
+                # Augment resource fields with an optional field filter
+                field_select: list[FieldProp] = []
+                if resource_augment.fields.text:
+                    field_select.append(FieldText())
+                if resource_augment.fields.classification_labels:
+                    field_select.append(FieldClassificationLabels())
+
+                augmentations.append(
+                    FieldAugment(
+                        given=resource_augment.given,  # type: ignore[arg-type]
+                        select=field_select,  # type: ignore[arg-type]
+                        filter=resource_augment.fields.filters,
+                    )
+                )
+
+    if item.fields is not None:
+        for field_augment in item.fields:
+            given = [FieldId.from_string(id) for id in field_augment.given]
+            select: list[FieldProp] = []
+            if field_augment.text:
+                select.append(FieldText())
+            if field_augment.entities:
+                select.append(FieldEntities())
+            if field_augment.classification_labels:
+                select.append(FieldClassificationLabels())
+
+            if len(select) > 0:
+                augmentations.append(
+                    FieldAugment(
+                        given=given,
+                        select=select,
+                    )
+                )
+
+            file_select: list[FileProp] = []
+            if field_augment.file_thumbnail:
+                file_select.append(FileThumbnail())
+
+            if len(file_select) > 0:
+                augmentations.append(
+                    FileAugment(
+                        given=given,  # type: ignore
+                        select=file_select,
+                    )
+                )
+
+            conversation_select: list[ConversationProp] = []
+            selector: ConversationSelector
+
+            if field_augment.full_conversation:
+                selector = FullSelector()
+                conversation_select.append(ConversationText(selector=selector))
+                if (
+                    field_augment.conversation_text_attachments
+                    or field_augment.conversation_image_attachments
+                ):
+                    conversation_select.append(ConversationAttachments(selector=selector))
+
+            elif field_augment.max_conversation_messages is not None:
+                # we want to always get the first conversation message and the
+                # window requested by the user
+                first_selector = MessageSelector(index="first")
+                window_selector = WindowSelector(size=field_augment.max_conversation_messages)
+                conversation_select.append(ConversationText(selector=first_selector))
+                conversation_select.append(ConversationText(selector=window_selector))
+                if (
+                    field_augment.conversation_text_attachments
+                    or field_augment.conversation_image_attachments
+                ):
+                    conversation_select.append(ConversationAttachments(selector=first_selector))
+                    conversation_select.append(ConversationAttachments(selector=window_selector))
+
+            if field_augment.conversation_answer_or_messages_after:
+                conversation_select.append(ConversationAnswerOrAfter())
+
+            if len(conversation_select) > 0:
+                augmentations.append(
+                    ConversationAugment(
+                        given=given,  # type: ignore
+                        select=conversation_select,
+                    )
+                )
+
+    if item.paragraphs is not None:
+        for paragraph_augment in item.paragraphs:
+            paragraphs_to_augment, paragraph_selector = parse_paragraph_augment(paragraph_augment)
+            augmentations.append(
+                ParagraphAugment(
+                    given=paragraphs_to_augment,
+                    select=paragraph_selector,
+                )
+            )
+
+    return augmentations
+
+
+def parse_deep_resource_augment(
+    item: AugmentResources,
+) -> tuple[list[ResourceProperties], list[ExtractedDataTypeName], list[ResourceProp]]:
+    show = []
+    if item.basic:
+        show.append(ResourceProperties.BASIC)
+    if item.origin:
+        show.append(ResourceProperties.ORIGIN)
+    if item.extra:
+        show.append(ResourceProperties.EXTRA)
+    if item.relations:
+        show.append(ResourceProperties.RELATIONS)
+    if item.values:
+        show.append(ResourceProperties.VALUES)
+    if item.errors:
+        show.append(ResourceProperties.ERRORS)
+    if item.security:
+        show.append(ResourceProperties.SECURITY)
+
+    extracted = []
+    if item.extracted_text:
+        extracted.append(ExtractedDataTypeName.TEXT)
+    if item.extracted_metadata:
+        extracted.append(ExtractedDataTypeName.METADATA)
+    if item.extracted_shortened_metadata:
+        extracted.append(ExtractedDataTypeName.SHORTENED_METADATA)
+    if item.extracted_large_metadata:
+        extracted.append(ExtractedDataTypeName.LARGE_METADATA)
+    if item.extracted_vector:
+        extracted.append(ExtractedDataTypeName.VECTOR)
+    if item.extracted_link:
+        extracted.append(ExtractedDataTypeName.LINK)
+    if item.extracted_file:
+        extracted.append(ExtractedDataTypeName.FILE)
+    if item.extracted_qa:
+        extracted.append(ExtractedDataTypeName.QA)
+
+    if len(extracted) > 0:
+        show.append(ResourceProperties.EXTRACTED)
+
+    select: list[ResourceProp] = []
+    if item.title:
+        select.append(ResourceTitle())
+    if item.summary:
+        select.append(ResourceSummary())
+    if item.classification_labels:
+        select.append(ResourceClassificationLabels())
+
+    return (
+        show,
+        extracted,
+        select,
+    )
+
+
+def parse_paragraph_augment(item: AugmentParagraphs) -> tuple[list[Paragraph], list[ParagraphProp]]:
+    paragraphs_to_augment = []
+    for paragraph in item.given:
+        try:
+            paragraph_id = ParagraphId.from_string(paragraph.id)
+        except ValueError:
+            # invalid paragraph id, skipping
+            continue
+
+        if paragraph.metadata is None:
+            metadata = None
+        else:
+            metadata = Metadata(
+                is_an_image=paragraph.metadata.is_an_image,
+                is_a_table=paragraph.metadata.is_a_table,
+                source_file=paragraph.metadata.source_file,
+                page=paragraph.metadata.page,
+                in_page_with_visual=paragraph.metadata.in_page_with_visual,
+            )
+
+        paragraphs_to_augment.append(Paragraph(id=paragraph_id, metadata=metadata))
+
+    selector: list[ParagraphProp] = []
+    if item.text:
+        selector.append(ParagraphText())
+    if item.neighbours_before or item.neighbours_after:
+        selector.append(
+            RelatedParagraphs(
+                neighbours_before=item.neighbours_before or 0,
+                neighbours_after=item.neighbours_after or 0,
+            )
+        )
+    if item.source_image:
+        selector.append(ParagraphImage())
+    if item.table_image:
+        selector.append(ParagraphTable(prefer_page_preview=item.table_prefers_page_preview))
+    if item.page_preview_image:
+        selector.append(ParagraphPage(preview=True))
+
+    return paragraphs_to_augment, selector
+
+
+def build_augment_response(item: AugmentRequest, augmented: Augmented) -> AugmentResponse:
+    response = AugmentResponse(
+        resources={},
+        fields={},
+        paragraphs={},
+    )
+
+    # start with deep resources, as they return a Resource object we can merge
+    # with the augmented model
+    for rid, resource_deep in augmented.resources_deep.items():
+        if resource_deep is None:
+            continue
+
+        augmented_resource = AugmentedResource(id=rid)
+        augmented_resource.updated_from(resource_deep)
+        response.resources[rid] = augmented_resource
+
+    # now we can cherry-pick properties from the augmented resources and merge
+    # them with the deep ones
+    for rid, resource in augmented.resources.items():
+        if resource is None:
+            continue
+
+        augmented_resource = response.resources.setdefault(rid, AugmentedResource(id=rid))
+
+        # merge resource with deep resources without overwriting
+        augmented_resource.title = augmented_resource.title or resource.title
+        augmented_resource.summary = augmented_resource.summary or resource.summary
+
+        # properties original to the augmented resources (not in deep resource augment)
+        if resource.classification_labels is not None:
+            augmented_resource.classification_labels = {
+                labelset: list(labels) for labelset, labels in resource.classification_labels.items()
+            }
+
+    for field_id, field in augmented.fields.items():
+        if field is None:
+            continue
+
+        # common augments for all fields
+
+        if field.classification_labels is None:
+            classification_labels = None
+        else:
+            classification_labels = {
+                labelset: list(labels) for labelset, labels in field.classification_labels.items()
+            }
+
+        if field.entities is None:
+            entities = None
+        else:
+            entities = {family: list(entity) for family, entity in field.entities.items()}
+
+        if field_id.type in (
+            FieldTypeName.TEXT.abbreviation(),
+            FieldTypeName.LINK.abbreviation(),
+            FieldTypeName.GENERIC.abbreviation(),
+        ):
+            response.fields[field_id.full()] = AugmentedField(
+                text=field.text,  # type: ignore # field is an instance of one of the above and has the text property
+                classification_labels=classification_labels,
+                entities=entities,
+            )
+
+        elif field_id.type == FieldTypeName.FILE.abbreviation():
+            field = cast(internal_augment.AugmentedFileField, field)
+            response.fields[field_id.full()] = AugmentedFileField(
+                text=field.text,  # type: ignore # field is an instance of one of the above and has the text property
+                classification_labels=classification_labels,
+                entities=entities,
+                thumbnail_image=field.thumbnail_path,
+            )
+
+        elif field_id.type == FieldTypeName.CONVERSATION.abbreviation():
+            field = cast(internal_augment.AugmentedConversationField, field)
+            conversation = AugmentedConversationField(
+                classification_labels=classification_labels,
+                entities=entities,
+            )
+
+            if field.messages is not None:
+                conversation.messages = []
+                for m in field.messages:
+                    if m.attachments is None:
+                        attachments = None
+                    else:
+                        attachments = []
+                        for f in m.attachments:
+                            attachments.append(f.full())
+
+                    conversation.messages.append(
+                        AugmentedConversationMessage(
+                            ident=m.ident,
+                            text=m.text,
+                            attachments=attachments,
+                        )
+                    )
+
+            response.fields[field_id.full()] = conversation
+
+        else:  # pragma: no cover
+            assert False, f"unknown field type: {field_id.type}"
+
+    for paragraph_id, paragraph in augmented.paragraphs.items():
+        if paragraph is None:
+            continue
+
+        augmented_paragraph = AugmentedParagraph()
+        augmented_paragraph.text = paragraph.text
+        if paragraph.related is not None:
+            augmented_paragraph.neighbours_before = list(
+                map(lambda x: x.full(), paragraph.related.neighbours_before)
+            )
+            augmented_paragraph.neighbours_after = list(
+                map(lambda x: x.full(), paragraph.related.neighbours_after)
+            )
+        augmented_paragraph.source_image = paragraph.source_image_path
+        augmented_paragraph.table_image = paragraph.table_image_path
+        augmented_paragraph.page_preview_image = paragraph.page_preview_path
+        response.paragraphs[paragraph_id.full()] = augmented_paragraph
+
+    return response
+
+
+def parse_second_augments(item: AugmentRequest, augmented: Augmented) -> list[Augment]:
+    """Given an augment request and a first augmentation, return a list of
+    augments required to fulfill the requested data.
+
+    """
+    augmentations: list[Augment] = []
+
+    for paragraph_augment in item.paragraphs or []:
+        if paragraph_augment.neighbours_before or paragraph_augment.neighbours_after:
+            neighbours = []
+            for paragraph_id, paragraph in augmented.paragraphs.items():
+                if paragraph.related is not None:
+                    for neighbour_before in paragraph.related.neighbours_before:
+                        neighbours.append(Paragraph(id=neighbour_before, metadata=None))
+                    for neighbour_after in paragraph.related.neighbours_after:
+                        neighbours.append(Paragraph(id=neighbour_after, metadata=None))
+
+            if neighbours:
+                augmentations.append(
+                    ParagraphAugment(
+                        given=neighbours,
+                        select=[
+                            ParagraphText(),
+                            ParagraphPosition(),
+                        ],
+                    )
+                )
+
+    return augmentations
+
+
+def merge_second_augment(item: AugmentRequest, response: AugmentResponse, augmented: Augmented):
+    """Merge in-place augmented data with an existing augment response."""
+
+    if any(
+        (
+            paragraph_augment.neighbours_before or paragraph_augment.neighbours_after
+            for paragraph_augment in item.paragraphs or []
+        )
+    ):
+        # neighbour paragraphs
+
+        new_paragraphs = {}
+        for paragraph_id_str, augmented_paragraph in response.paragraphs.items():
+            before_refs = []
+            for before_id_str in augmented_paragraph.neighbours_before or []:
+                before_id = ParagraphId.from_string(before_id_str)
+
+                if before_id not in augmented.paragraphs:
+                    continue
+                neighbour = augmented.paragraphs[before_id]
+
+                if before_id_str not in response.paragraphs:
+                    if not neighbour.text and not neighbour.position:
+                        continue
+                    # create a new paragraph for the neighbour
+                    new_paragraphs[before_id_str] = AugmentedParagraph(
+                        text=neighbour.text, position=neighbour.position
+                    )
+
+                else:
+                    # merge neighbour with existing paragraph
+                    if not response.paragraphs[before_id_str].text:
+                        response.paragraphs[before_id_str].text = neighbour.text
+
+                before_refs.append(before_id_str)
+
+            after_refs = []
+            for after_id_str in augmented_paragraph.neighbours_after or []:
+                after_id = ParagraphId.from_string(after_id_str)
+
+                if after_id not in augmented.paragraphs:
+                    continue
+                neighbour = augmented.paragraphs[after_id]
+
+                if after_id_str not in response.paragraphs:
+                    if not neighbour.text and not neighbour.position:
+                        continue
+                    # create a new paragraph for the neighbour
+                    new_paragraphs[after_id_str] = AugmentedParagraph(
+                        text=neighbour.text, position=neighbour.position
+                    )
+
+                else:
+                    # merge neighbour with existing paragraph
+                    if not response.paragraphs[after_id_str].text:
+                        response.paragraphs[after_id_str].text = neighbour.text
+
+                after_refs.append(after_id_str)
+
+            # update references to contain only the neighbours that existed in
+            # the response or that we added
+            augmented_paragraph.neighbours_before = before_refs
+            augmented_paragraph.neighbours_after = after_refs
+
+        response.paragraphs.update(new_paragraphs)
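
Usage note: the endpoint above is mounted at POST /v1/kb/{kbid}/augment (hidden from the OpenAPI schema via include_in_schema=False) and requires the READER role. The following client-side sketch is not part of the diff: the payload keys are inferred from parse_first_augments() above, the authoritative schemas are nucliadb_models.augment.AugmentRequest / AugmentResponse, and the base URL, knowledge box id, and id strings are placeholders.

# Hypothetical sketch of calling the new augment endpoint on a standalone
# NucliaDB; payload keys are inferred from the parsing code in the diff above.
import httpx

payload = {
    # Paragraph augments: hydrate paragraph text plus one neighbour on each
    # side (neighbours trigger the second augmentor round trip described above).
    "paragraphs": [
        {
            "given": [{"id": "<rid>/t/text1/0-100"}],  # ParagraphId as string
            "text": True,
            "neighbours_before": 1,
            "neighbours_after": 1,
        }
    ],
    # Field augments: hydrate extracted text for a specific field.
    "fields": [{"given": ["<rid>/t/text1"], "text": True}],  # FieldId as string
}

resp = httpx.post(
    "http://localhost:8080/api/v1/kb/<kbid>/augment",
    json=payload,
    headers={"X-NUCLIADB-ROLES": "READER"},  # standalone-style role header
)
resp.raise_for_status()
# AugmentResponse shape: {"resources": {...}, "fields": {...}, "paragraphs": {...}}
data = resp.json()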
nucliadb/search/api/v1/catalog.py
@@ -19,12 +19,12 @@
 #
 import json
 from time import time
-from typing import Optional, Union
 
 from fastapi import Request, Response
 from fastapi_versioning import version
 from pydantic import ValidationError
 
+from nucliadb.common.catalog import catalog_facets, catalog_search
 from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
 from nucliadb.common.exceptions import InvalidQueryError
 from nucliadb.models.responses import HTTPClientError
@@ -33,7 +33,6 @@ from nucliadb.search.api.v1.router import KB_PREFIX, api
 from nucliadb.search.api.v1.utils import fastapi_query
 from nucliadb.search.search import cache
 from nucliadb.search.search.merge import fetch_resources
-from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
 from nucliadb.search.search.query_parser.parsers import parse_catalog
 from nucliadb.search.search.utils import (
     maybe_log_request_payload,
@@ -75,31 +74,28 @@ async def catalog_get(
     response: Response,
     kbid: str,
     query: str = fastapi_query(SearchParamDefaults.query),
-    filter_expression: Optional[str] = fastapi_query(SearchParamDefaults.catalog_filter_expression),
+    filter_expression: str | None = fastapi_query(SearchParamDefaults.catalog_filter_expression),
     filters: list[str] = fastapi_query(SearchParamDefaults.filters),
     faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
     sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
-    sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
     sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
     page_number: int = fastapi_query(SearchParamDefaults.catalog_page_number),
     page_size: int = fastapi_query(SearchParamDefaults.catalog_page_size),
-    with_status: Optional[ResourceProcessingStatus] = fastapi_query(
+    with_status: ResourceProcessingStatus | None = fastapi_query(
         SearchParamDefaults.with_status, deprecated="Use filters instead"
     ),
     debug: bool = fastapi_query(SearchParamDefaults.debug, include_in_schema=False),
-    range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
-    range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
-    range_modification_start: Optional[DateTime] = fastapi_query(
+    range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
+    range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
+    range_modification_start: DateTime | None = fastapi_query(
         SearchParamDefaults.range_modification_start
     ),
-    range_modification_end: Optional[DateTime] = fastapi_query(
-        SearchParamDefaults.range_modification_end
-    ),
-    hidden: Optional[bool] = fastapi_query(SearchParamDefaults.hidden),
+    range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
+    hidden: bool | None = fastapi_query(SearchParamDefaults.hidden),
     show: list[ResourceProperties] = fastapi_query(
         SearchParamDefaults.show, default=[ResourceProperties.BASIC, ResourceProperties.ERRORS]
     ),
-) -> Union[CatalogResponse, HTTPClientError]:
+) -> CatalogResponse | HTTPClientError:
     try:
         expr = (
             CatalogFilterExpression.model_validate_json(filter_expression) if filter_expression else None
@@ -125,7 +121,7 @@ async def catalog_get(
         show=show,
     )
     if sort_field:
-        item.sort = SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
+        item.sort = SortOptions(field=sort_field, order=sort_order)
     return await catalog(kbid, item)
 
 
@@ -144,14 +140,14 @@ async def catalog_post(
     request: Request,
     kbid: str,
     item: CatalogRequest,
-) -> Union[CatalogResponse, HTTPClientError]:
+) -> CatalogResponse | HTTPClientError:
     return await catalog(kbid, item)
 
 
 async def catalog(
     kbid: str,
     item: CatalogRequest,
-) -> Union[HTTPClientError, CatalogResponse]:
+) -> HTTPClientError | CatalogResponse:
     """
     Catalog endpoint is a simplified version of the search endpoint, it only
     returns bm25 results on titles and it does not support vector search.
@@ -164,7 +160,7 @@ async def catalog(
            query_parser = await parse_catalog(kbid, item)
 
            catalog_results = CatalogResponse()
-           catalog_results.fulltext = await pgcatalog_search(query_parser)
+           catalog_results.fulltext = await catalog_search(query_parser)
            catalog_results.resources = await fetch_resources(
                resources=[r.rid for r in catalog_results.fulltext.results],
                kbid=kbid,
@@ -205,7 +201,7 @@
 )
 @requires(NucliaDBRoles.READER)
 @version(1)
-async def catalog_facets(
+async def catalog_facets_endpoint(
     request: Request, kbid: str, item: CatalogFacetsRequest
 ) -> CatalogFacetsResponse:
-    return CatalogFacetsResponse(facets=await pgcatalog_facets(kbid, item))
+    return CatalogFacetsResponse(facets=await catalog_facets(kbid, item))
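
Aside from the Optional/Union to PEP 604 (X | None) cleanup and the removal of the deprecated sort_limit parameter, the substantive change here is the move from the Postgres-specific helpers in nucliadb/search/search/pgcatalog.py to the backend-agnostic facade added under nucliadb/common/catalog/ (files 20 to 24 in the list above); the handler is also renamed to catalog_facets_endpoint so it no longer shadows the imported catalog_facets function. A rough sketch of what this migration looks like for any other caller, assuming the facade keeps the call shapes shown in the hunks:

# Before (6.7.x): callers imported the Postgres-backed implementation directly.
# from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search

# After (6.10.x): callers use the facade, which presumably dispatches to a
# concrete backend (common/catalog/pg.py, or common/catalog/dummy.py) behind
# the interface defined in common/catalog/interface.py.
from nucliadb.common.catalog import catalog_facets, catalog_search


async def list_catalog(kbid, query_parser, facets_request):
    fulltext = await catalog_search(query_parser)  # same call shape as before
    facets = await catalog_facets(kbid, facets_request)
    return fulltext, facets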