nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -66,6 +66,9 @@ class TextBlockHydrationOptions(BaseModel):
|
|
66
66
|
# list of exact matches to highlight
|
67
67
|
ematches: Optional[list[str]] = None
|
68
68
|
|
69
|
+
# If true, only hydrate the text block if its text is not already populated
|
70
|
+
only_hydrate_empty: bool = False
|
71
|
+
|
69
72
|
|
70
73
|
@hydrator_observer.wrap({"type": "resource_text"})
|
71
74
|
async def hydrate_resource_text(
|
@@ -161,6 +164,8 @@ async def hydrate_text_block(
|
|
161
164
|
`text_block` object.
|
162
165
|
|
163
166
|
"""
|
167
|
+
if options.only_hydrate_empty and text_block.text:
|
168
|
+
return text_block
|
164
169
|
async with AsyncExitStack() as stack:
|
165
170
|
if concurrency_control is not None:
|
166
171
|
await stack.enter_async_context(concurrency_control)
|
@@ -188,4 +193,5 @@ def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
|
|
188
193
|
reference=text_block.representation_file,
|
189
194
|
page_with_visual=text_block.page_with_visual,
|
190
195
|
position=text_block.position,
|
196
|
+
relevant_relations=text_block.relevant_relations,
|
191
197
|
)
|
nucliadb/search/search/merge.py
CHANGED
@@ -23,6 +23,8 @@ import math
|
|
23
23
|
from typing import Any, Optional, Set, Union
|
24
24
|
|
25
25
|
from nucliadb.common.ids import FieldId, ParagraphId
|
26
|
+
from nucliadb.common.models_utils import from_proto
|
27
|
+
from nucliadb.common.models_utils.from_proto import RelationTypePbMap
|
26
28
|
from nucliadb.search.search import cache
|
27
29
|
from nucliadb.search.search.cut import cut_page
|
28
30
|
from nucliadb.search.search.fetch import (
|
@@ -33,11 +35,11 @@ from nucliadb.search.search.fetch import (
|
|
33
35
|
)
|
34
36
|
from nucliadb_models.common import FieldTypeName
|
35
37
|
from nucliadb_models.labels import translate_system_to_alias_label
|
36
|
-
from nucliadb_models.metadata import RelationTypePbMap
|
37
38
|
from nucliadb_models.resource import ExtractedDataTypeName
|
38
39
|
from nucliadb_models.search import (
|
39
40
|
DirectionalRelation,
|
40
41
|
EntitySubgraph,
|
42
|
+
EntityType,
|
41
43
|
KnowledgeboxSearchResults,
|
42
44
|
KnowledgeboxSuggestResults,
|
43
45
|
MinScore,
|
@@ -46,7 +48,6 @@ from nucliadb_models.search import (
|
|
46
48
|
RelatedEntities,
|
47
49
|
RelatedEntity,
|
48
50
|
RelationDirection,
|
49
|
-
RelationNodeTypeMap,
|
50
51
|
Relations,
|
51
52
|
ResourceProperties,
|
52
53
|
ResourceResult,
|
@@ -71,6 +72,7 @@ from nucliadb_protos.nodereader_pb2 import (
|
|
71
72
|
SuggestResponse,
|
72
73
|
VectorSearchResponse,
|
73
74
|
)
|
75
|
+
from nucliadb_protos.utils_pb2 import RelationNode
|
74
76
|
|
75
77
|
from .metrics import merge_observer
|
76
78
|
from .paragraphs import get_paragraph_text, get_text_sentence
|
@@ -81,6 +83,15 @@ TitleScore = str
|
|
81
83
|
SortValue = Union[Bm25Score, TimestampScore, TitleScore]
|
82
84
|
|
83
85
|
|
86
|
+
def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
|
87
|
+
return {
|
88
|
+
RelationNode.NodeType.ENTITY: EntityType.ENTITY,
|
89
|
+
RelationNode.NodeType.LABEL: EntityType.LABEL,
|
90
|
+
RelationNode.NodeType.RESOURCE: EntityType.RESOURCE,
|
91
|
+
RelationNode.NodeType.USER: EntityType.USER,
|
92
|
+
}[node_type]
|
93
|
+
|
94
|
+
|
84
95
|
def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
|
85
96
|
results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
|
86
97
|
|
@@ -432,15 +443,38 @@ async def merge_paragraph_results(
|
|
432
443
|
async def merge_relations_results(
|
433
444
|
relations_responses: list[RelationSearchResponse],
|
434
445
|
query: EntitiesSubgraphRequest,
|
446
|
+
only_with_metadata: bool = False,
|
447
|
+
only_agentic: bool = False,
|
435
448
|
) -> Relations:
|
436
449
|
loop = asyncio.get_event_loop()
|
437
|
-
return await loop.run_in_executor(
|
450
|
+
return await loop.run_in_executor(
|
451
|
+
None,
|
452
|
+
_merge_relations_results,
|
453
|
+
relations_responses,
|
454
|
+
query,
|
455
|
+
only_with_metadata,
|
456
|
+
only_agentic,
|
457
|
+
)
|
438
458
|
|
439
459
|
|
440
460
|
def _merge_relations_results(
|
441
461
|
relations_responses: list[RelationSearchResponse],
|
442
462
|
query: EntitiesSubgraphRequest,
|
463
|
+
only_with_metadata: bool,
|
464
|
+
only_agentic: bool,
|
443
465
|
) -> Relations:
|
466
|
+
"""
|
467
|
+
Merge relation search responses into a single Relations object while applying filters.
|
468
|
+
|
469
|
+
Args:
|
470
|
+
relations_responses: List of relation search responses
|
471
|
+
query: EntitiesSubgraphRequest object
|
472
|
+
only_with_metadata: If True, only include relations with metadata. This metadata includes paragraph_id and entity positions among other things.
|
473
|
+
only_agentic: If True, only include relations extracted by a Graph Extraction Agent.
|
474
|
+
|
475
|
+
Returns:
|
476
|
+
Relations
|
477
|
+
"""
|
444
478
|
relations = Relations(entities={})
|
445
479
|
|
446
480
|
for entry_point in query.entry_points:
|
@@ -452,27 +486,37 @@ def _merge_relations_results(
|
|
452
486
|
destination = relation.to
|
453
487
|
relation_type = RelationTypePbMap[relation.relation]
|
454
488
|
relation_label = relation.relation_label
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
489
|
+
metadata = relation.metadata if relation.HasField("metadata") else None
|
490
|
+
# If only_with_metadata is True, we check that metadata for the relation is not None
|
491
|
+
# If only_agentic is True, we check that metadata for the relation is not None and that it has a data_augmentation_task_id
|
492
|
+
# TODO: This is suboptimal, we should be able to filter this in the query to the index,
|
493
|
+
if (not only_with_metadata or metadata) and (
|
494
|
+
not only_agentic or (metadata and metadata.data_augmentation_task_id)
|
495
|
+
):
|
496
|
+
if origin.value in relations.entities:
|
497
|
+
relations.entities[origin.value].related_to.append(
|
498
|
+
DirectionalRelation(
|
499
|
+
entity=destination.value,
|
500
|
+
entity_type=relation_node_type_to_entity_type(destination.ntype),
|
501
|
+
entity_subtype=destination.subtype,
|
502
|
+
relation=relation_type,
|
503
|
+
relation_label=relation_label,
|
504
|
+
direction=RelationDirection.OUT,
|
505
|
+
metadata=from_proto.relation_metadata(metadata) if metadata else None,
|
506
|
+
)
|
464
507
|
)
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
508
|
+
elif destination.value in relations.entities:
|
509
|
+
relations.entities[destination.value].related_to.append(
|
510
|
+
DirectionalRelation(
|
511
|
+
entity=origin.value,
|
512
|
+
entity_type=relation_node_type_to_entity_type(origin.ntype),
|
513
|
+
entity_subtype=origin.subtype,
|
514
|
+
relation=relation_type,
|
515
|
+
relation_label=relation_label,
|
516
|
+
direction=RelationDirection.IN,
|
517
|
+
metadata=from_proto.relation_metadata(metadata) if metadata else None,
|
518
|
+
)
|
474
519
|
)
|
475
|
-
)
|
476
520
|
|
477
521
|
return relations
|
478
522
|
|
@@ -571,11 +615,22 @@ async def merge_suggest_entities_results(
|
|
571
615
|
return RelatedEntities(entities=list(unique_entities), total=len(unique_entities))
|
572
616
|
|
573
617
|
|
618
|
+
def merge_relation_prefix_results(
|
619
|
+
responses: list[SearchResponse],
|
620
|
+
) -> RelatedEntities:
|
621
|
+
unique_entities: Set[RelatedEntity] = set()
|
622
|
+
for response in responses:
|
623
|
+
response_entities = (
|
624
|
+
RelatedEntity(family=e.subtype, value=e.value) for e in response.relation.prefix.nodes
|
625
|
+
)
|
626
|
+
unique_entities.update(response_entities)
|
627
|
+
|
628
|
+
return RelatedEntities(entities=list(unique_entities), total=len(unique_entities))
|
629
|
+
|
630
|
+
|
574
631
|
async def merge_suggest_results(
|
575
632
|
suggest_responses: list[SuggestResponse],
|
576
633
|
kbid: str,
|
577
|
-
show: list[ResourceProperties],
|
578
|
-
field_type_filter: list[FieldTypeName],
|
579
634
|
highlight: bool = False,
|
580
635
|
) -> KnowledgeboxSuggestResults:
|
581
636
|
api_results = KnowledgeboxSuggestResults()
|