nucliadb 6.3.7.post4066__py3-none-any.whl → 6.3.7.post4071__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,7 +20,7 @@
20
20
  import asyncio
21
21
  import datetime
22
22
  import math
23
- from typing import Any, Optional, Set, Union
23
+ from typing import Any, Iterable, Optional, Set, Union
24
24
 
25
25
  from nucliadb.common.ids import FieldId, ParagraphId
26
26
  from nucliadb.common.models_utils import from_proto
@@ -33,6 +33,7 @@ from nucliadb.search.search.fetch import (
33
33
  get_labels_resource,
34
34
  get_seconds_paragraph,
35
35
  )
36
+ from nucliadb.search.search.query_parser.models import FulltextQuery, UnitRetrieval
36
37
  from nucliadb_models.common import FieldTypeName
37
38
  from nucliadb_models.labels import translate_system_to_alias_label
38
39
  from nucliadb_models.metadata import RelationType
@@ -43,7 +44,6 @@ from nucliadb_models.search import (
43
44
  EntityType,
44
45
  KnowledgeboxSearchResults,
45
46
  KnowledgeboxSuggestResults,
46
- MinScore,
47
47
  Paragraph,
48
48
  Paragraphs,
49
49
  RelatedEntities,
@@ -65,7 +65,6 @@ from nucliadb_protos.nodereader_pb2 import (
65
65
  DocumentResult,
66
66
  DocumentScored,
67
67
  DocumentSearchResponse,
68
- EntitiesSubgraphRequest,
69
68
  ParagraphResult,
70
69
  ParagraphSearchResponse,
71
70
  RelationSearchResponse,
@@ -129,21 +128,17 @@ async def get_sort_value(
129
128
 
130
129
 
131
130
  async def merge_documents_results(
132
- document_responses: list[DocumentSearchResponse],
133
- resources: list[str],
134
- top_k: int,
135
131
  kbid: str,
136
- sort: SortOptions,
137
- min_score: float,
138
- ) -> Resources:
132
+ responses: list[DocumentSearchResponse],
133
+ *,
134
+ query: FulltextQuery,
135
+ top_k: int,
136
+ ) -> tuple[Resources, list[str]]:
139
137
  raw_resource_list: list[tuple[DocumentResult, SortValue]] = []
140
138
  facets: dict[str, Any] = {}
141
- query = None
142
139
  total = 0
143
140
  next_page = False
144
- for document_response in document_responses:
145
- if query is None:
146
- query = document_response.query
141
+ for document_response in responses:
147
142
  if document_response.facets:
148
143
  for key, value in document_response.facets.items():
149
144
  key = translate_system_to_alias_label(key)
@@ -155,7 +150,7 @@ async def merge_documents_results(
155
150
  if document_response.next_page:
156
151
  next_page = True
157
152
  for result in document_response.results:
158
- sort_value = await get_sort_value(result, sort.field, kbid)
153
+ sort_value = await get_sort_value(result, query.order_by, kbid)
159
154
  if sort_value is not None:
160
155
  raw_resource_list.append((result, sort_value))
161
156
  total += document_response.total
@@ -163,8 +158,9 @@ async def merge_documents_results(
163
158
  # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
164
159
  raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
165
160
  next_page = next_page or has_more
166
- raw_resource_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
161
+ raw_resource_list.sort(key=lambda x: x[1], reverse=(query.sort == SortOrder.DESC))
167
162
 
163
+ result_resource_ids = []
168
164
  result_resource_list: list[ResourceResult] = []
169
165
  for result, _ in raw_resource_list:
170
166
  labels = await get_labels_resource(result, kbid)
@@ -179,26 +175,26 @@ async def merge_documents_results(
179
175
  labels=labels,
180
176
  )
181
177
  )
182
- if result.uuid not in resources:
183
- resources.append(result.uuid)
178
+ if result.uuid not in result_resource_ids:
179
+ result_resource_ids.append(result.uuid)
184
180
 
185
181
  return Resources(
186
182
  facets=facets,
187
183
  results=result_resource_list,
188
- query=query,
184
+ query=query.query,
189
185
  total=total,
190
186
  page_number=0, # Bw/c with pagination
191
187
  page_size=top_k,
192
188
  next_page=next_page,
193
- min_score=min_score,
194
- )
189
+ min_score=query.min_score,
190
+ ), result_resource_ids
195
191
 
196
192
 
197
193
  async def merge_suggest_paragraph_results(
198
194
  suggest_responses: list[SuggestResponse],
199
195
  kbid: str,
200
196
  highlight: bool,
201
- ):
197
+ ) -> Paragraphs:
202
198
  raw_paragraph_list: list[ParagraphResult] = []
203
199
  query = None
204
200
  ematches = None
@@ -266,7 +262,7 @@ async def merge_vectors_results(
266
262
  kbid: str,
267
263
  top_k: int,
268
264
  min_score: Optional[float] = None,
269
- ):
265
+ ) -> Sentences:
270
266
  facets: dict[str, Any] = {}
271
267
  raw_vectors_list: list[DocumentScored] = []
272
268
 
@@ -339,14 +335,13 @@ async def merge_vectors_results(
339
335
 
340
336
 
341
337
  async def merge_paragraph_results(
342
- paragraph_responses: list[ParagraphSearchResponse],
343
- resources: list[str],
344
338
  kbid: str,
339
+ paragraph_responses: list[ParagraphSearchResponse],
345
340
  top_k: int,
346
341
  highlight: bool,
347
342
  sort: SortOptions,
348
343
  min_score: float,
349
- ) -> Paragraphs:
344
+ ) -> tuple[Paragraphs, list[str]]:
350
345
  raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
351
346
  facets: dict[str, Any] = {}
352
347
  query = None
@@ -379,6 +374,7 @@ async def merge_paragraph_results(
379
374
  raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
380
375
  next_page = next_page or has_more
381
376
 
377
+ result_resource_ids = []
382
378
  result_paragraph_list: list[Paragraph] = []
383
379
  for result, _ in raw_paragraph_list:
384
380
  _, field_type, field = result.field.split("/")
@@ -426,8 +422,8 @@ async def merge_paragraph_results(
426
422
  new_paragraph.end_seconds = seconds_positions[1]
427
423
 
428
424
  result_paragraph_list.append(new_paragraph)
429
- if new_paragraph.rid not in resources:
430
- resources.append(new_paragraph.rid)
425
+ if new_paragraph.rid not in result_resource_ids:
426
+ result_resource_ids.append(new_paragraph.rid)
431
427
  return Paragraphs(
432
428
  results=result_paragraph_list,
433
429
  facets=facets,
@@ -437,13 +433,13 @@ async def merge_paragraph_results(
437
433
  page_size=top_k,
438
434
  next_page=next_page,
439
435
  min_score=min_score,
440
- )
436
+ ), result_resource_ids
441
437
 
442
438
 
443
439
  @merge_observer.wrap({"type": "merge_relations"})
444
440
  async def merge_relations_results(
445
441
  relations_responses: list[RelationSearchResponse],
446
- query: EntitiesSubgraphRequest,
442
+ query_entry_points: Iterable[RelationNode],
447
443
  only_with_metadata: bool = False,
448
444
  only_agentic: bool = False,
449
445
  only_entity_to_entity: bool = False,
@@ -453,7 +449,7 @@ async def merge_relations_results(
453
449
  None,
454
450
  _merge_relations_results,
455
451
  relations_responses,
456
- query,
452
+ query_entry_points,
457
453
  only_with_metadata,
458
454
  only_agentic,
459
455
  only_entity_to_entity,
@@ -462,26 +458,26 @@ async def merge_relations_results(
462
458
 
463
459
  def _merge_relations_results(
464
460
  relations_responses: list[RelationSearchResponse],
465
- query: EntitiesSubgraphRequest,
461
+ query_entry_points: Iterable[RelationNode],
466
462
  only_with_metadata: bool,
467
463
  only_agentic: bool,
468
464
  only_entity_to_entity: bool,
469
465
  ) -> Relations:
470
- """
471
- Merge relation search responses into a single Relations object while applying filters.
466
+ """Merge relation search responses into a single Relations object while applying filters.
467
+
468
+ - When `only_with_metadata` is enabled, only include paths with metadata
469
+ (this can include paragraph_id and entity positions among other things)
472
470
 
473
- Args:
474
- relations_responses: List of relation search responses
475
- query: EntitiesSubgraphRequest object
476
- only_with_metadata: If True, only include relations with metadata. This metadata includes paragraph_id and entity positions among other things.
477
- only_agentic: If True, only include relations extracted by a Graph Extraction Agent.
471
+ - When `only_agentic` is enabled, only include relations extracted by a Graph
472
+ Extraction Agent
473
+
474
+ - When `only_entity_to_entity` is enabled, only include relations between
475
+ nodes with type ENTITY
478
476
 
479
- Returns:
480
- Relations
481
477
  """
482
478
  relations = Relations(entities={})
483
479
 
484
- for entry_point in query.entry_points:
480
+ for entry_point in query_entry_points:
485
481
  relations.entities[entry_point.value] = EntitySubgraph(related_to=[])
486
482
 
487
483
  for relation_response in relations_responses:
@@ -541,14 +537,11 @@ def _merge_relations_results(
541
537
  @merge_observer.wrap({"type": "merge"})
542
538
  async def merge_results(
543
539
  search_responses: list[SearchResponse],
544
- top_k: int,
540
+ retrieval: UnitRetrieval,
545
541
  kbid: str,
546
542
  show: list[ResourceProperties],
547
543
  field_type_filter: list[FieldTypeName],
548
544
  extracted: list[ExtractedDataTypeName],
549
- sort: SortOptions,
550
- requested_relations: EntitiesSubgraphRequest,
551
- min_score: MinScore,
552
545
  highlight: bool = False,
553
546
  ) -> KnowledgeboxSearchResults:
554
547
  paragraphs = []
@@ -565,25 +558,45 @@ async def merge_results(
565
558
  api_results = KnowledgeboxSearchResults()
566
559
 
567
560
  resources: list[str] = list()
568
- api_results.fulltext = await merge_documents_results(
569
- documents, resources, top_k, kbid, sort, min_score=min_score.bm25
570
- )
571
561
 
572
- api_results.paragraphs = await merge_paragraph_results(
573
- paragraphs,
574
- resources,
575
- kbid,
576
- top_k,
577
- highlight,
578
- sort,
579
- min_score=min_score.bm25,
580
- )
562
+ if retrieval.query.fulltext is not None:
563
+ api_results.fulltext, matched_resources = await merge_documents_results(
564
+ kbid,
565
+ documents,
566
+ query=retrieval.query.fulltext,
567
+ top_k=retrieval.top_k,
568
+ )
569
+ resources.extend(matched_resources)
581
570
 
582
- api_results.sentences = await merge_vectors_results(
583
- vectors, resources, kbid, top_k, min_score=min_score.semantic
584
- )
571
+ if retrieval.query.keyword is not None:
572
+ sort = SortOptions(
573
+ field=retrieval.query.keyword.order_by,
574
+ order=retrieval.query.keyword.sort,
575
+ limit=None, # unused
576
+ )
577
+ api_results.paragraphs, matched_resources = await merge_paragraph_results(
578
+ kbid,
579
+ paragraphs,
580
+ retrieval.top_k,
581
+ highlight,
582
+ sort,
583
+ min_score=retrieval.query.keyword.min_score,
584
+ )
585
+ resources.extend(matched_resources)
586
+
587
+ if retrieval.query.semantic is not None:
588
+ api_results.sentences = await merge_vectors_results(
589
+ vectors,
590
+ resources,
591
+ kbid,
592
+ retrieval.top_k,
593
+ min_score=retrieval.query.semantic.min_score,
594
+ )
585
595
 
586
- api_results.relations = await merge_relations_results(relations, requested_relations)
596
+ if retrieval.query.relation is not None:
597
+ api_results.relations = await merge_relations_results(
598
+ relations, retrieval.query.relation.detected_entities
599
+ )
587
600
 
588
601
  api_results.resources = await fetch_resources(resources, kbid, show, field_type_filter, extracted)
589
602
  return api_results
@@ -602,11 +615,9 @@ async def merge_paragraphs_results(
602
615
 
603
616
  api_results = ResourceSearchResults()
604
617
 
605
- resources: list[str] = list()
606
- api_results.paragraphs = await merge_paragraph_results(
607
- paragraphs,
608
- resources,
618
+ api_results.paragraphs, _ = await merge_paragraph_results(
609
619
  kbid,
620
+ paragraphs,
610
621
  top_k,
611
622
  highlight=highlight_split,
612
623
  sort=SortOptions(