nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -66,6 +66,9 @@ class TextBlockHydrationOptions(BaseModel):
66
66
  # list of exact matches to highlight
67
67
  ematches: Optional[list[str]] = None
68
68
 
69
+ # If true, only hydrate the text block if its text is not already populated
70
+ only_hydrate_empty: bool = False
71
+
69
72
 
70
73
  @hydrator_observer.wrap({"type": "resource_text"})
71
74
  async def hydrate_resource_text(
@@ -161,6 +164,8 @@ async def hydrate_text_block(
161
164
  `text_block` object.
162
165
 
163
166
  """
167
+ if options.only_hydrate_empty and text_block.text:
168
+ return text_block
164
169
  async with AsyncExitStack() as stack:
165
170
  if concurrency_control is not None:
166
171
  await stack.enter_async_context(concurrency_control)
@@ -188,4 +193,5 @@ def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
188
193
  reference=text_block.representation_file,
189
194
  page_with_visual=text_block.page_with_visual,
190
195
  position=text_block.position,
196
+ relevant_relations=text_block.relevant_relations,
191
197
  )
@@ -23,6 +23,8 @@ import math
23
23
  from typing import Any, Optional, Set, Union
24
24
 
25
25
  from nucliadb.common.ids import FieldId, ParagraphId
26
+ from nucliadb.common.models_utils import from_proto
27
+ from nucliadb.common.models_utils.from_proto import RelationTypePbMap
26
28
  from nucliadb.search.search import cache
27
29
  from nucliadb.search.search.cut import cut_page
28
30
  from nucliadb.search.search.fetch import (
@@ -33,11 +35,11 @@ from nucliadb.search.search.fetch import (
33
35
  )
34
36
  from nucliadb_models.common import FieldTypeName
35
37
  from nucliadb_models.labels import translate_system_to_alias_label
36
- from nucliadb_models.metadata import RelationTypePbMap
37
38
  from nucliadb_models.resource import ExtractedDataTypeName
38
39
  from nucliadb_models.search import (
39
40
  DirectionalRelation,
40
41
  EntitySubgraph,
42
+ EntityType,
41
43
  KnowledgeboxSearchResults,
42
44
  KnowledgeboxSuggestResults,
43
45
  MinScore,
@@ -46,7 +48,6 @@ from nucliadb_models.search import (
46
48
  RelatedEntities,
47
49
  RelatedEntity,
48
50
  RelationDirection,
49
- RelationNodeTypeMap,
50
51
  Relations,
51
52
  ResourceProperties,
52
53
  ResourceResult,
@@ -71,6 +72,7 @@ from nucliadb_protos.nodereader_pb2 import (
71
72
  SuggestResponse,
72
73
  VectorSearchResponse,
73
74
  )
75
+ from nucliadb_protos.utils_pb2 import RelationNode
74
76
 
75
77
  from .metrics import merge_observer
76
78
  from .paragraphs import get_paragraph_text, get_text_sentence
@@ -81,6 +83,15 @@ TitleScore = str
81
83
  SortValue = Union[Bm25Score, TimestampScore, TitleScore]
82
84
 
83
85
 
86
+ def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
87
+ return {
88
+ RelationNode.NodeType.ENTITY: EntityType.ENTITY,
89
+ RelationNode.NodeType.LABEL: EntityType.LABEL,
90
+ RelationNode.NodeType.RESOURCE: EntityType.RESOURCE,
91
+ RelationNode.NodeType.USER: EntityType.USER,
92
+ }[node_type]
93
+
94
+
84
95
  def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
85
96
  results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
86
97
 
@@ -432,15 +443,38 @@ async def merge_paragraph_results(
432
443
  async def merge_relations_results(
433
444
  relations_responses: list[RelationSearchResponse],
434
445
  query: EntitiesSubgraphRequest,
446
+ only_with_metadata: bool = False,
447
+ only_agentic: bool = False,
435
448
  ) -> Relations:
436
449
  loop = asyncio.get_event_loop()
437
- return await loop.run_in_executor(None, _merge_relations_results, relations_responses, query)
450
+ return await loop.run_in_executor(
451
+ None,
452
+ _merge_relations_results,
453
+ relations_responses,
454
+ query,
455
+ only_with_metadata,
456
+ only_agentic,
457
+ )
438
458
 
439
459
 
440
460
  def _merge_relations_results(
441
461
  relations_responses: list[RelationSearchResponse],
442
462
  query: EntitiesSubgraphRequest,
463
+ only_with_metadata: bool,
464
+ only_agentic: bool,
443
465
  ) -> Relations:
466
+ """
467
+ Merge relation search responses into a single Relations object while applying filters.
468
+
469
+ Args:
470
+ relations_responses: List of relation search responses
471
+ query: EntitiesSubgraphRequest object
472
+ only_with_metadata: If True, only include relations with metadata. This metadata includes paragraph_id and entity positions among other things.
473
+ only_agentic: If True, only include relations extracted by a Graph Extraction Agent.
474
+
475
+ Returns:
476
+ Relations
477
+ """
444
478
  relations = Relations(entities={})
445
479
 
446
480
  for entry_point in query.entry_points:
@@ -452,27 +486,37 @@ def _merge_relations_results(
452
486
  destination = relation.to
453
487
  relation_type = RelationTypePbMap[relation.relation]
454
488
  relation_label = relation.relation_label
455
-
456
- if origin.value in relations.entities:
457
- relations.entities[origin.value].related_to.append(
458
- DirectionalRelation(
459
- entity=destination.value,
460
- entity_type=RelationNodeTypeMap[destination.ntype],
461
- relation=relation_type,
462
- relation_label=relation_label,
463
- direction=RelationDirection.OUT,
489
+ metadata = relation.metadata if relation.HasField("metadata") else None
490
+ # If only_with_metadata is True, we check that metadata for the relation is not None
491
+ # If only_agentic is True, we check that metadata for the relation is not None and that it has a data_augmentation_task_id
492
+ # TODO: This is suboptimal, we should be able to filter this in the query to the index,
493
+ if (not only_with_metadata or metadata) and (
494
+ not only_agentic or (metadata and metadata.data_augmentation_task_id)
495
+ ):
496
+ if origin.value in relations.entities:
497
+ relations.entities[origin.value].related_to.append(
498
+ DirectionalRelation(
499
+ entity=destination.value,
500
+ entity_type=relation_node_type_to_entity_type(destination.ntype),
501
+ entity_subtype=destination.subtype,
502
+ relation=relation_type,
503
+ relation_label=relation_label,
504
+ direction=RelationDirection.OUT,
505
+ metadata=from_proto.relation_metadata(metadata) if metadata else None,
506
+ )
464
507
  )
465
- )
466
- elif destination.value in relations.entities:
467
- relations.entities[destination.value].related_to.append(
468
- DirectionalRelation(
469
- entity=origin.value,
470
- entity_type=RelationNodeTypeMap[origin.ntype],
471
- relation=relation_type,
472
- relation_label=relation_label,
473
- direction=RelationDirection.IN,
508
+ elif destination.value in relations.entities:
509
+ relations.entities[destination.value].related_to.append(
510
+ DirectionalRelation(
511
+ entity=origin.value,
512
+ entity_type=relation_node_type_to_entity_type(origin.ntype),
513
+ entity_subtype=origin.subtype,
514
+ relation=relation_type,
515
+ relation_label=relation_label,
516
+ direction=RelationDirection.IN,
517
+ metadata=from_proto.relation_metadata(metadata) if metadata else None,
518
+ )
474
519
  )
475
- )
476
520
 
477
521
  return relations
478
522
 
@@ -571,11 +615,22 @@ async def merge_suggest_entities_results(
571
615
  return RelatedEntities(entities=list(unique_entities), total=len(unique_entities))
572
616
 
573
617
 
618
+ def merge_relation_prefix_results(
619
+ responses: list[SearchResponse],
620
+ ) -> RelatedEntities:
621
+ unique_entities: Set[RelatedEntity] = set()
622
+ for response in responses:
623
+ response_entities = (
624
+ RelatedEntity(family=e.subtype, value=e.value) for e in response.relation.prefix.nodes
625
+ )
626
+ unique_entities.update(response_entities)
627
+
628
+ return RelatedEntities(entities=list(unique_entities), total=len(unique_entities))
629
+
630
+
574
631
  async def merge_suggest_results(
575
632
  suggest_responses: list[SuggestResponse],
576
633
  kbid: str,
577
- show: list[ResourceProperties],
578
- field_type_filter: list[FieldTypeName],
579
634
  highlight: bool = False,
580
635
  ) -> KnowledgeboxSuggestResults:
581
636
  api_results = KnowledgeboxSuggestResults()