nucliadb 6.3.7.post4066__py3-none-any.whl → 6.3.7.post4068__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/search/api/v1/search.py +6 -39
- nucliadb/search/search/chat/ask.py +19 -26
- nucliadb/search/search/chat/query.py +6 -6
- nucliadb/search/search/find.py +21 -91
- nucliadb/search/search/find_merge.py +18 -9
- nucliadb/search/search/graph_strategy.py +9 -10
- nucliadb/search/search/merge.py +76 -65
- nucliadb/search/search/query.py +2 -455
- nucliadb/search/search/query_parser/fetcher.py +41 -0
- nucliadb/search/search/query_parser/models.py +82 -8
- nucliadb/search/search/query_parser/parsers/ask.py +77 -0
- nucliadb/search/search/query_parser/parsers/common.py +189 -0
- nucliadb/search/search/query_parser/parsers/find.py +175 -13
- nucliadb/search/search/query_parser/parsers/search.py +249 -0
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +176 -0
- nucliadb/search/search/rerankers.py +4 -2
- {nucliadb-6.3.7.post4066.dist-info → nucliadb-6.3.7.post4068.dist-info}/METADATA +6 -6
- {nucliadb-6.3.7.post4066.dist-info → nucliadb-6.3.7.post4068.dist-info}/RECORD +21 -17
- {nucliadb-6.3.7.post4066.dist-info → nucliadb-6.3.7.post4068.dist-info}/WHEEL +0 -0
- {nucliadb-6.3.7.post4066.dist-info → nucliadb-6.3.7.post4068.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.7.post4066.dist-info → nucliadb-6.3.7.post4068.dist-info}/top_level.txt +0 -0
nucliadb/search/search/merge.py
CHANGED
@@ -20,7 +20,7 @@
|
|
20
20
|
import asyncio
|
21
21
|
import datetime
|
22
22
|
import math
|
23
|
-
from typing import Any, Optional, Set, Union
|
23
|
+
from typing import Any, Iterable, Optional, Set, Union
|
24
24
|
|
25
25
|
from nucliadb.common.ids import FieldId, ParagraphId
|
26
26
|
from nucliadb.common.models_utils import from_proto
|
@@ -33,6 +33,7 @@ from nucliadb.search.search.fetch import (
|
|
33
33
|
get_labels_resource,
|
34
34
|
get_seconds_paragraph,
|
35
35
|
)
|
36
|
+
from nucliadb.search.search.query_parser.models import FulltextQuery, UnitRetrieval
|
36
37
|
from nucliadb_models.common import FieldTypeName
|
37
38
|
from nucliadb_models.labels import translate_system_to_alias_label
|
38
39
|
from nucliadb_models.metadata import RelationType
|
@@ -43,7 +44,6 @@ from nucliadb_models.search import (
|
|
43
44
|
EntityType,
|
44
45
|
KnowledgeboxSearchResults,
|
45
46
|
KnowledgeboxSuggestResults,
|
46
|
-
MinScore,
|
47
47
|
Paragraph,
|
48
48
|
Paragraphs,
|
49
49
|
RelatedEntities,
|
@@ -65,7 +65,6 @@ from nucliadb_protos.nodereader_pb2 import (
|
|
65
65
|
DocumentResult,
|
66
66
|
DocumentScored,
|
67
67
|
DocumentSearchResponse,
|
68
|
-
EntitiesSubgraphRequest,
|
69
68
|
ParagraphResult,
|
70
69
|
ParagraphSearchResponse,
|
71
70
|
RelationSearchResponse,
|
@@ -129,21 +128,17 @@ async def get_sort_value(
|
|
129
128
|
|
130
129
|
|
131
130
|
async def merge_documents_results(
|
132
|
-
document_responses: list[DocumentSearchResponse],
|
133
|
-
resources: list[str],
|
134
|
-
top_k: int,
|
135
131
|
kbid: str,
|
136
|
-
|
137
|
-
|
138
|
-
|
132
|
+
responses: list[DocumentSearchResponse],
|
133
|
+
*,
|
134
|
+
query: FulltextQuery,
|
135
|
+
top_k: int,
|
136
|
+
) -> tuple[Resources, list[str]]:
|
139
137
|
raw_resource_list: list[tuple[DocumentResult, SortValue]] = []
|
140
138
|
facets: dict[str, Any] = {}
|
141
|
-
query = None
|
142
139
|
total = 0
|
143
140
|
next_page = False
|
144
|
-
for document_response in
|
145
|
-
if query is None:
|
146
|
-
query = document_response.query
|
141
|
+
for document_response in responses:
|
147
142
|
if document_response.facets:
|
148
143
|
for key, value in document_response.facets.items():
|
149
144
|
key = translate_system_to_alias_label(key)
|
@@ -155,7 +150,7 @@ async def merge_documents_results(
|
|
155
150
|
if document_response.next_page:
|
156
151
|
next_page = True
|
157
152
|
for result in document_response.results:
|
158
|
-
sort_value = await get_sort_value(result,
|
153
|
+
sort_value = await get_sort_value(result, query.order_by, kbid)
|
159
154
|
if sort_value is not None:
|
160
155
|
raw_resource_list.append((result, sort_value))
|
161
156
|
total += document_response.total
|
@@ -163,8 +158,9 @@ async def merge_documents_results(
|
|
163
158
|
# We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
|
164
159
|
raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
|
165
160
|
next_page = next_page or has_more
|
166
|
-
raw_resource_list.sort(key=lambda x: x[1], reverse=(sort
|
161
|
+
raw_resource_list.sort(key=lambda x: x[1], reverse=(query.sort == SortOrder.DESC))
|
167
162
|
|
163
|
+
result_resource_ids = []
|
168
164
|
result_resource_list: list[ResourceResult] = []
|
169
165
|
for result, _ in raw_resource_list:
|
170
166
|
labels = await get_labels_resource(result, kbid)
|
@@ -179,26 +175,26 @@ async def merge_documents_results(
|
|
179
175
|
labels=labels,
|
180
176
|
)
|
181
177
|
)
|
182
|
-
if result.uuid not in
|
183
|
-
|
178
|
+
if result.uuid not in result_resource_ids:
|
179
|
+
result_resource_ids.append(result.uuid)
|
184
180
|
|
185
181
|
return Resources(
|
186
182
|
facets=facets,
|
187
183
|
results=result_resource_list,
|
188
|
-
query=query,
|
184
|
+
query=query.query,
|
189
185
|
total=total,
|
190
186
|
page_number=0, # Bw/c with pagination
|
191
187
|
page_size=top_k,
|
192
188
|
next_page=next_page,
|
193
|
-
min_score=min_score,
|
194
|
-
)
|
189
|
+
min_score=query.min_score,
|
190
|
+
), result_resource_ids
|
195
191
|
|
196
192
|
|
197
193
|
async def merge_suggest_paragraph_results(
|
198
194
|
suggest_responses: list[SuggestResponse],
|
199
195
|
kbid: str,
|
200
196
|
highlight: bool,
|
201
|
-
):
|
197
|
+
) -> Paragraphs:
|
202
198
|
raw_paragraph_list: list[ParagraphResult] = []
|
203
199
|
query = None
|
204
200
|
ematches = None
|
@@ -266,7 +262,7 @@ async def merge_vectors_results(
|
|
266
262
|
kbid: str,
|
267
263
|
top_k: int,
|
268
264
|
min_score: Optional[float] = None,
|
269
|
-
):
|
265
|
+
) -> Sentences:
|
270
266
|
facets: dict[str, Any] = {}
|
271
267
|
raw_vectors_list: list[DocumentScored] = []
|
272
268
|
|
@@ -339,14 +335,13 @@ async def merge_vectors_results(
|
|
339
335
|
|
340
336
|
|
341
337
|
async def merge_paragraph_results(
|
342
|
-
paragraph_responses: list[ParagraphSearchResponse],
|
343
|
-
resources: list[str],
|
344
338
|
kbid: str,
|
339
|
+
paragraph_responses: list[ParagraphSearchResponse],
|
345
340
|
top_k: int,
|
346
341
|
highlight: bool,
|
347
342
|
sort: SortOptions,
|
348
343
|
min_score: float,
|
349
|
-
) -> Paragraphs:
|
344
|
+
) -> tuple[Paragraphs, list[str]]:
|
350
345
|
raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
|
351
346
|
facets: dict[str, Any] = {}
|
352
347
|
query = None
|
@@ -379,6 +374,7 @@ async def merge_paragraph_results(
|
|
379
374
|
raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
|
380
375
|
next_page = next_page or has_more
|
381
376
|
|
377
|
+
result_resource_ids = []
|
382
378
|
result_paragraph_list: list[Paragraph] = []
|
383
379
|
for result, _ in raw_paragraph_list:
|
384
380
|
_, field_type, field = result.field.split("/")
|
@@ -426,8 +422,8 @@ async def merge_paragraph_results(
|
|
426
422
|
new_paragraph.end_seconds = seconds_positions[1]
|
427
423
|
|
428
424
|
result_paragraph_list.append(new_paragraph)
|
429
|
-
if new_paragraph.rid not in
|
430
|
-
|
425
|
+
if new_paragraph.rid not in result_resource_ids:
|
426
|
+
result_resource_ids.append(new_paragraph.rid)
|
431
427
|
return Paragraphs(
|
432
428
|
results=result_paragraph_list,
|
433
429
|
facets=facets,
|
@@ -437,13 +433,13 @@ async def merge_paragraph_results(
|
|
437
433
|
page_size=top_k,
|
438
434
|
next_page=next_page,
|
439
435
|
min_score=min_score,
|
440
|
-
)
|
436
|
+
), result_resource_ids
|
441
437
|
|
442
438
|
|
443
439
|
@merge_observer.wrap({"type": "merge_relations"})
|
444
440
|
async def merge_relations_results(
|
445
441
|
relations_responses: list[RelationSearchResponse],
|
446
|
-
|
442
|
+
query_entry_points: Iterable[RelationNode],
|
447
443
|
only_with_metadata: bool = False,
|
448
444
|
only_agentic: bool = False,
|
449
445
|
only_entity_to_entity: bool = False,
|
@@ -453,7 +449,7 @@ async def merge_relations_results(
|
|
453
449
|
None,
|
454
450
|
_merge_relations_results,
|
455
451
|
relations_responses,
|
456
|
-
|
452
|
+
query_entry_points,
|
457
453
|
only_with_metadata,
|
458
454
|
only_agentic,
|
459
455
|
only_entity_to_entity,
|
@@ -462,26 +458,26 @@ async def merge_relations_results(
|
|
462
458
|
|
463
459
|
def _merge_relations_results(
|
464
460
|
relations_responses: list[RelationSearchResponse],
|
465
|
-
|
461
|
+
query_entry_points: Iterable[RelationNode],
|
466
462
|
only_with_metadata: bool,
|
467
463
|
only_agentic: bool,
|
468
464
|
only_entity_to_entity: bool,
|
469
465
|
) -> Relations:
|
470
|
-
"""
|
471
|
-
|
466
|
+
"""Merge relation search responses into a single Relations object while applying filters.
|
467
|
+
|
468
|
+
- When `only_with_metadata` is enabled, only include paths with metadata
|
469
|
+
(this can include paragraph_id and entity positions among other things)
|
472
470
|
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
471
|
+
- When `only_agentic` is enabled, only include relations extracted by a Graph
|
472
|
+
Extraction Agent
|
473
|
+
|
474
|
+
- When `only_entity_to_entity` is enabled, only include relations between
|
475
|
+
nodes with type ENTITY
|
478
476
|
|
479
|
-
Returns:
|
480
|
-
Relations
|
481
477
|
"""
|
482
478
|
relations = Relations(entities={})
|
483
479
|
|
484
|
-
for entry_point in
|
480
|
+
for entry_point in query_entry_points:
|
485
481
|
relations.entities[entry_point.value] = EntitySubgraph(related_to=[])
|
486
482
|
|
487
483
|
for relation_response in relations_responses:
|
@@ -541,14 +537,11 @@ def _merge_relations_results(
|
|
541
537
|
@merge_observer.wrap({"type": "merge"})
|
542
538
|
async def merge_results(
|
543
539
|
search_responses: list[SearchResponse],
|
544
|
-
|
540
|
+
retrieval: UnitRetrieval,
|
545
541
|
kbid: str,
|
546
542
|
show: list[ResourceProperties],
|
547
543
|
field_type_filter: list[FieldTypeName],
|
548
544
|
extracted: list[ExtractedDataTypeName],
|
549
|
-
sort: SortOptions,
|
550
|
-
requested_relations: EntitiesSubgraphRequest,
|
551
|
-
min_score: MinScore,
|
552
545
|
highlight: bool = False,
|
553
546
|
) -> KnowledgeboxSearchResults:
|
554
547
|
paragraphs = []
|
@@ -565,25 +558,45 @@ async def merge_results(
|
|
565
558
|
api_results = KnowledgeboxSearchResults()
|
566
559
|
|
567
560
|
resources: list[str] = list()
|
568
|
-
api_results.fulltext = await merge_documents_results(
|
569
|
-
documents, resources, top_k, kbid, sort, min_score=min_score.bm25
|
570
|
-
)
|
571
561
|
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
)
|
562
|
+
if retrieval.query.fulltext is not None:
|
563
|
+
api_results.fulltext, matched_resources = await merge_documents_results(
|
564
|
+
kbid,
|
565
|
+
documents,
|
566
|
+
query=retrieval.query.fulltext,
|
567
|
+
top_k=retrieval.top_k,
|
568
|
+
)
|
569
|
+
resources.extend(matched_resources)
|
581
570
|
|
582
|
-
|
583
|
-
|
584
|
-
|
571
|
+
if retrieval.query.keyword is not None:
|
572
|
+
sort = SortOptions(
|
573
|
+
field=retrieval.query.keyword.order_by,
|
574
|
+
order=retrieval.query.keyword.sort,
|
575
|
+
limit=None, # unused
|
576
|
+
)
|
577
|
+
api_results.paragraphs, matched_resources = await merge_paragraph_results(
|
578
|
+
kbid,
|
579
|
+
paragraphs,
|
580
|
+
retrieval.top_k,
|
581
|
+
highlight,
|
582
|
+
sort,
|
583
|
+
min_score=retrieval.query.keyword.min_score,
|
584
|
+
)
|
585
|
+
resources.extend(matched_resources)
|
586
|
+
|
587
|
+
if retrieval.query.semantic is not None:
|
588
|
+
api_results.sentences = await merge_vectors_results(
|
589
|
+
vectors,
|
590
|
+
resources,
|
591
|
+
kbid,
|
592
|
+
retrieval.top_k,
|
593
|
+
min_score=retrieval.query.semantic.min_score,
|
594
|
+
)
|
585
595
|
|
586
|
-
|
596
|
+
if retrieval.query.relation is not None:
|
597
|
+
api_results.relations = await merge_relations_results(
|
598
|
+
relations, retrieval.query.relation.detected_entities
|
599
|
+
)
|
587
600
|
|
588
601
|
api_results.resources = await fetch_resources(resources, kbid, show, field_type_filter, extracted)
|
589
602
|
return api_results
|
@@ -602,11 +615,9 @@ async def merge_paragraphs_results(
|
|
602
615
|
|
603
616
|
api_results = ResourceSearchResults()
|
604
617
|
|
605
|
-
|
606
|
-
api_results.paragraphs = await merge_paragraph_results(
|
607
|
-
paragraphs,
|
608
|
-
resources,
|
618
|
+
api_results.paragraphs, _ = await merge_paragraph_results(
|
609
619
|
kbid,
|
620
|
+
paragraphs,
|
610
621
|
top_k,
|
611
622
|
highlight=highlight_split,
|
612
623
|
sort=SortOptions(
|