PyPI - nucliadb - Versions diffs - 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl - Mend

nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

migrations/0028_extracted_vectors_reference.py +61 -0
migrations/0029_backfill_field_status.py +149 -0
migrations/0030_label_deduplication.py +60 -0
nucliadb/common/cluster/manager.py +41 -331
nucliadb/common/cluster/rebalance.py +2 -2
nucliadb/common/cluster/rollover.py +12 -71
nucliadb/common/cluster/settings.py +3 -0
nucliadb/common/cluster/standalone/utils.py +0 -43
nucliadb/common/cluster/utils.py +0 -16
nucliadb/common/counters.py +1 -0
nucliadb/common/datamanagers/fields.py +48 -7
nucliadb/common/datamanagers/vectorsets.py +11 -2
nucliadb/common/external_index_providers/base.py +2 -1
nucliadb/common/external_index_providers/pinecone.py +3 -5
nucliadb/common/ids.py +18 -4
nucliadb/common/models_utils/from_proto.py +479 -0
nucliadb/common/models_utils/to_proto.py +60 -0
nucliadb/common/nidx.py +76 -37
nucliadb/export_import/models.py +3 -3
nucliadb/health.py +0 -7
nucliadb/ingest/app.py +0 -8
nucliadb/ingest/consumer/auditing.py +1 -1
nucliadb/ingest/consumer/shard_creator.py +1 -1
nucliadb/ingest/fields/base.py +83 -21
nucliadb/ingest/orm/brain.py +55 -56
nucliadb/ingest/orm/broker_message.py +12 -2
nucliadb/ingest/orm/entities.py +6 -17
nucliadb/ingest/orm/knowledgebox.py +44 -22
nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
nucliadb/ingest/orm/processor/processor.py +5 -2
nucliadb/ingest/orm/resource.py +222 -413
nucliadb/ingest/processing.py +8 -2
nucliadb/ingest/serialize.py +77 -46
nucliadb/ingest/service/writer.py +2 -56
nucliadb/ingest/settings.py +1 -4
nucliadb/learning_proxy.py +6 -4
nucliadb/purge/__init__.py +102 -12
nucliadb/purge/orphan_shards.py +6 -4
nucliadb/reader/api/models.py +3 -3
nucliadb/reader/api/v1/__init__.py +1 -0
nucliadb/reader/api/v1/download.py +2 -2
nucliadb/reader/api/v1/knowledgebox.py +3 -3
nucliadb/reader/api/v1/resource.py +23 -12
nucliadb/reader/api/v1/services.py +4 -4
nucliadb/reader/api/v1/vectorsets.py +48 -0
nucliadb/search/api/v1/ask.py +11 -1
nucliadb/search/api/v1/feedback.py +3 -3
nucliadb/search/api/v1/knowledgebox.py +8 -13
nucliadb/search/api/v1/search.py +3 -2
nucliadb/search/api/v1/suggest.py +0 -2
nucliadb/search/predict.py +6 -4
nucliadb/search/requesters/utils.py +1 -2
nucliadb/search/search/chat/ask.py +77 -13
nucliadb/search/search/chat/prompt.py +16 -5
nucliadb/search/search/chat/query.py +74 -34
nucliadb/search/search/exceptions.py +2 -7
nucliadb/search/search/find.py +9 -5
nucliadb/search/search/find_merge.py +10 -4
nucliadb/search/search/graph_strategy.py +884 -0
nucliadb/search/search/hydrator.py +6 -0
nucliadb/search/search/merge.py +79 -24
nucliadb/search/search/query.py +74 -245
nucliadb/search/search/query_parser/exceptions.py +11 -1
nucliadb/search/search/query_parser/fetcher.py +405 -0
nucliadb/search/search/query_parser/models.py +0 -3
nucliadb/search/search/query_parser/parser.py +22 -21
nucliadb/search/search/rerankers.py +1 -42
nucliadb/search/search/shards.py +19 -0
nucliadb/standalone/api_router.py +2 -14
nucliadb/standalone/settings.py +4 -0
nucliadb/train/generators/field_streaming.py +7 -3
nucliadb/train/lifecycle.py +3 -6
nucliadb/train/nodes.py +14 -12
nucliadb/train/resource.py +380 -0
nucliadb/writer/api/constants.py +20 -16
nucliadb/writer/api/v1/__init__.py +1 -0
nucliadb/writer/api/v1/export_import.py +1 -1
nucliadb/writer/api/v1/field.py +13 -7
nucliadb/writer/api/v1/knowledgebox.py +3 -46
nucliadb/writer/api/v1/resource.py +20 -13
nucliadb/writer/api/v1/services.py +10 -1
nucliadb/writer/api/v1/upload.py +61 -34
nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
nucliadb/writer/back_pressure.py +17 -46
nucliadb/writer/resource/basic.py +9 -7
nucliadb/writer/resource/field.py +42 -9
nucliadb/writer/settings.py +2 -2
nucliadb/writer/tus/gcs.py +11 -10
{nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
{nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
{nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
nucliadb/common/cluster/discovery/base.py +0 -178
nucliadb/common/cluster/discovery/k8s.py +0 -301
nucliadb/common/cluster/discovery/manual.py +0 -57
nucliadb/common/cluster/discovery/single.py +0 -51
nucliadb/common/cluster/discovery/types.py +0 -32
nucliadb/common/cluster/discovery/utils.py +0 -67
nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
nucliadb/common/cluster/standalone/index_node.py +0 -123
nucliadb/common/cluster/standalone/service.py +0 -84
nucliadb/standalone/introspect.py +0 -208
nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
/nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
{nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
{nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0

nucliadb/search/search/hydrator.py CHANGED Viewed

@@ -66,6 +66,9 @@ class TextBlockHydrationOptions(BaseModel):
     # list of exact matches to highlight
     ematches: Optional[list[str]] = None
+    # If true, only hydrate the text block if its text is not already populated
+    only_hydrate_empty: bool = False
 @hydrator_observer.wrap({"type": "resource_text"})
 async def hydrate_resource_text(
@@ -161,6 +164,8 @@ async def hydrate_text_block(
     `text_block` object.
     """
+    if options.only_hydrate_empty and text_block.text:
+        return text_block
     async with AsyncExitStack() as stack:
         if concurrency_control is not None:
             await stack.enter_async_context(concurrency_control)
@@ -188,4 +193,5 @@ def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
         reference=text_block.representation_file,
         page_with_visual=text_block.page_with_visual,
         position=text_block.position,
+        relevant_relations=text_block.relevant_relations,
     )

nucliadb/search/search/merge.py CHANGED Viewed

@@ -23,6 +23,8 @@ import math
 from typing import Any, Optional, Set, Union
 from nucliadb.common.ids import FieldId, ParagraphId
+from nucliadb.common.models_utils import from_proto
+from nucliadb.common.models_utils.from_proto import RelationTypePbMap
 from nucliadb.search.search import cache
 from nucliadb.search.search.cut import cut_page
 from nucliadb.search.search.fetch import (
@@ -33,11 +35,11 @@ from nucliadb.search.search.fetch import (
 )
 from nucliadb_models.common import FieldTypeName
 from nucliadb_models.labels import translate_system_to_alias_label
-from nucliadb_models.metadata import RelationTypePbMap
 from nucliadb_models.resource import ExtractedDataTypeName
 from nucliadb_models.search import (
     DirectionalRelation,
     EntitySubgraph,
+    EntityType,
     KnowledgeboxSearchResults,
     KnowledgeboxSuggestResults,
     MinScore,
@@ -46,7 +48,6 @@ from nucliadb_models.search import (
     RelatedEntities,
     RelatedEntity,
     RelationDirection,
-    RelationNodeTypeMap,
     Relations,
     ResourceProperties,
     ResourceResult,
@@ -71,6 +72,7 @@ from nucliadb_protos.nodereader_pb2 import (
     SuggestResponse,
     VectorSearchResponse,
 )
+from nucliadb_protos.utils_pb2 import RelationNode
 from .metrics import merge_observer
 from .paragraphs import get_paragraph_text, get_text_sentence
@@ -81,6 +83,15 @@ TitleScore = str
 SortValue = Union[Bm25Score, TimestampScore, TitleScore]
+def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
+    return {
+        RelationNode.NodeType.ENTITY: EntityType.ENTITY,
+        RelationNode.NodeType.LABEL: EntityType.LABEL,
+        RelationNode.NodeType.RESOURCE: EntityType.RESOURCE,
+        RelationNode.NodeType.USER: EntityType.USER,
+    }[node_type]
 def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
     results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
@@ -432,15 +443,38 @@ async def merge_paragraph_results(
 async def merge_relations_results(
     relations_responses: list[RelationSearchResponse],
     query: EntitiesSubgraphRequest,
+    only_with_metadata: bool = False,
+    only_agentic: bool = False,
 ) -> Relations:
     loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(None, _merge_relations_results, relations_responses, query)
+    return await loop.run_in_executor(
+        None,
+        _merge_relations_results,
+        relations_responses,
+        query,
+        only_with_metadata,
+        only_agentic,
+    )
 def _merge_relations_results(
     relations_responses: list[RelationSearchResponse],
     query: EntitiesSubgraphRequest,
+    only_with_metadata: bool,
+    only_agentic: bool,
 ) -> Relations:
+    """
+    Merge relation search responses into a single Relations object while applying filters.
+    Args:
+        relations_responses: List of relation search responses
+        query: EntitiesSubgraphRequest object
+        only_with_metadata: If True, only include relations with metadata. This metadata includes paragraph_id and entity positions among other things.
+        only_agentic: If True, only include relations extracted by a Graph Extraction Agent.
+    Returns:
+        Relations
+    """
     relations = Relations(entities={})
     for entry_point in query.entry_points:
@@ -452,27 +486,37 @@ def _merge_relations_results(
             destination = relation.to
             relation_type = RelationTypePbMap[relation.relation]
             relation_label = relation.relation_label
-            if origin.value in relations.entities:
-                relations.entities[origin.value].related_to.append(
-                    DirectionalRelation(
-                        entity=destination.value,
-                        entity_type=RelationNodeTypeMap[destination.ntype],
-                        relation=relation_type,
-                        relation_label=relation_label,
-                        direction=RelationDirection.OUT,
+            metadata = relation.metadata if relation.HasField("metadata") else None
+            # If only_with_metadata is True, we check that metadata for the relation is not None
+            # If only_agentic is True, we check that metadata for the relation is not None and that it has a data_augmentation_task_id
+            # TODO: This is suboptimal, we should be able to filter this in the query to the index,
+            if (not only_with_metadata or metadata) and (
+                not only_agentic or (metadata and metadata.data_augmentation_task_id)
+            ):
+                if origin.value in relations.entities:
+                    relations.entities[origin.value].related_to.append(
+                        DirectionalRelation(
+                            entity=destination.value,
+                            entity_type=relation_node_type_to_entity_type(destination.ntype),
+                            entity_subtype=destination.subtype,
+                            relation=relation_type,
+                            relation_label=relation_label,
+                            direction=RelationDirection.OUT,
+                            metadata=from_proto.relation_metadata(metadata) if metadata else None,
+                        )
                     )
-                )
-            elif destination.value in relations.entities:
-                relations.entities[destination.value].related_to.append(
-                    DirectionalRelation(
-                        entity=origin.value,
-                        entity_type=RelationNodeTypeMap[origin.ntype],
-                        relation=relation_type,
-                        relation_label=relation_label,
-                        direction=RelationDirection.IN,
+                elif destination.value in relations.entities:
+                    relations.entities[destination.value].related_to.append(
+                        DirectionalRelation(
+                            entity=origin.value,
+                            entity_type=relation_node_type_to_entity_type(origin.ntype),
+                            entity_subtype=origin.subtype,
+                            relation=relation_type,
+                            relation_label=relation_label,
+                            direction=RelationDirection.IN,
+                            metadata=from_proto.relation_metadata(metadata) if metadata else None,
+                        )
                     )
-                )
     return relations
@@ -571,11 +615,22 @@ async def merge_suggest_entities_results(
     return RelatedEntities(entities=list(unique_entities), total=len(unique_entities))
+def merge_relation_prefix_results(
+    responses: list[SearchResponse],
+) -> RelatedEntities:
+    unique_entities: Set[RelatedEntity] = set()
+    for response in responses:
+        response_entities = (
+            RelatedEntity(family=e.subtype, value=e.value) for e in response.relation.prefix.nodes
+        )
+        unique_entities.update(response_entities)
+    return RelatedEntities(entities=list(unique_entities), total=len(unique_entities))
 async def merge_suggest_results(
     suggest_responses: list[SuggestResponse],
     kbid: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
     highlight: bool = False,
 ) -> KnowledgeboxSuggestResults:
     api_results = KnowledgeboxSuggestResults()

nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl