nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
from nucliadb.common.ids import FieldId
|
|
22
|
+
from nucliadb.search.search.metrics import Metrics
|
|
23
|
+
from nucliadb_models.augment import AugmentRequest, AugmentResponse
|
|
24
|
+
from nucliadb_models.labels import KnowledgeBoxLabels
|
|
25
|
+
from nucliadb_models.retrieval import RetrievalRequest, RetrievalResponse
|
|
26
|
+
from nucliadb_models.search import FindRequest, Image, KnowledgeboxFindResults, NucliaDBClientType
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# TODO(decoupled-ask): replace this for a sdk.find call when moving /ask to RAO
|
|
30
|
+
async def find(
|
|
31
|
+
kbid: str,
|
|
32
|
+
item: FindRequest,
|
|
33
|
+
x_ndb_client: NucliaDBClientType,
|
|
34
|
+
x_nucliadb_user: str,
|
|
35
|
+
x_forwarded_for: str,
|
|
36
|
+
# REVIEW(decoupled-ask): once in an SDK metrics, we'll lose track of metrics
|
|
37
|
+
metrics: Metrics,
|
|
38
|
+
) -> tuple[KnowledgeboxFindResults, bool]:
|
|
39
|
+
from nucliadb.search.search.find import find
|
|
40
|
+
|
|
41
|
+
results, incomplete, _ = await find(
|
|
42
|
+
kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for, metrics
|
|
43
|
+
)
|
|
44
|
+
return results, incomplete
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# TODO(decoupled-ask): replace this for a sdk.retrieve call when moving /ask to RAO
|
|
48
|
+
async def retrieve(
|
|
49
|
+
kbid: str,
|
|
50
|
+
item: RetrievalRequest,
|
|
51
|
+
*,
|
|
52
|
+
x_ndb_client: NucliaDBClientType,
|
|
53
|
+
x_nucliadb_user: str,
|
|
54
|
+
x_forwarded_for: str,
|
|
55
|
+
) -> RetrievalResponse:
|
|
56
|
+
from nucliadb.search.api.v1.retrieve import retrieve_endpoint
|
|
57
|
+
|
|
58
|
+
return await retrieve_endpoint(
|
|
59
|
+
kbid,
|
|
60
|
+
item,
|
|
61
|
+
x_ndb_client=x_ndb_client,
|
|
62
|
+
x_nucliadb_user=x_nucliadb_user,
|
|
63
|
+
x_forwarded_for=x_forwarded_for,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# TODO(decoupled-ask): replace this for a sdk.augment call when moving /ask to RAO
|
|
68
|
+
async def augment(kbid: str, item: AugmentRequest) -> AugmentResponse:
|
|
69
|
+
from nucliadb.search.api.v1.augment import augment_endpoint
|
|
70
|
+
|
|
71
|
+
return await augment_endpoint(kbid, item)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# TODO(decoupled-ask): replace this for a sdk.labelsets call when moving /ask to RAO
|
|
75
|
+
async def labelsets(kbid: str) -> KnowledgeBoxLabels:
|
|
76
|
+
from nucliadb.reader.api.v1.services import get_labelsets
|
|
77
|
+
|
|
78
|
+
return await get_labelsets(kbid)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# TODO(decoupled-ask): replace this for a sdk.download call when moving /ask to RAO
|
|
82
|
+
async def download_image(kbid: str, field_id: FieldId, path: str, *, mime_type: str) -> Image | None:
|
|
83
|
+
from nucliadb.search.search.hydrator.images import download_image
|
|
84
|
+
|
|
85
|
+
return await download_image(kbid, field_id, path, mime_type=mime_type)
|
nucliadb/search/search/fetch.py
CHANGED
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
from contextvars import ContextVar
|
|
22
|
-
from typing import Optional
|
|
23
22
|
|
|
24
23
|
from nidx_protos.nodereader_pb2 import DocumentResult, ParagraphResult
|
|
25
24
|
|
|
@@ -36,7 +35,7 @@ from nucliadb_protos.resources_pb2 import Paragraph
|
|
|
36
35
|
from nucliadb_utils import const
|
|
37
36
|
from nucliadb_utils.utilities import has_feature
|
|
38
37
|
|
|
39
|
-
rcache: ContextVar[
|
|
38
|
+
rcache: ContextVar[dict[str, ResourceORM] | None] = ContextVar("rcache", default=None)
|
|
40
39
|
|
|
41
40
|
|
|
42
41
|
async def fetch_resources(
|
|
@@ -79,7 +78,7 @@ async def fetch_resources(
|
|
|
79
78
|
|
|
80
79
|
async def get_paragraph_from_resource(
|
|
81
80
|
orm_resource: ResourceORM, result: ParagraphResult
|
|
82
|
-
) ->
|
|
81
|
+
) -> Paragraph | None:
|
|
83
82
|
_, field_type, field = result.field.split("/")
|
|
84
83
|
field_type_int = FIELD_TYPE_STR_TO_PB[field_type]
|
|
85
84
|
field_obj = await orm_resource.get_field(field, field_type_int, load=False)
|
|
@@ -144,7 +143,7 @@ async def get_labels_paragraph(result: ParagraphResult, kbid: str) -> list[str]:
|
|
|
144
143
|
|
|
145
144
|
async def get_seconds_paragraph(
|
|
146
145
|
result: ParagraphResult, kbid: str
|
|
147
|
-
) ->
|
|
146
|
+
) -> tuple[list[int], list[int]] | None:
|
|
148
147
|
orm_resource = await cache.get_resource(kbid, result.uuid)
|
|
149
148
|
|
|
150
149
|
if orm_resource is None:
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
from collections.abc import Iterator
|
|
21
|
-
from typing import Any
|
|
21
|
+
from typing import Any
|
|
22
22
|
|
|
23
23
|
from nucliadb.common.exceptions import InvalidQueryError
|
|
24
24
|
from nucliadb_models.labels import translate_alias_to_system_label
|
|
@@ -108,7 +108,7 @@ def split_labels_by_type(
|
|
|
108
108
|
|
|
109
109
|
def is_paragraph_labelset_kind(labelset_id: str, classification_labels: knowledgebox_pb2.Labels) -> bool:
|
|
110
110
|
try:
|
|
111
|
-
labelset:
|
|
111
|
+
labelset: knowledgebox_pb2.LabelSet | None = classification_labels.labelset.get(labelset_id)
|
|
112
112
|
if labelset is None:
|
|
113
113
|
return False
|
|
114
114
|
return knowledgebox_pb2.LabelSet.LabelSetKind.PARAGRAPHS in labelset.kind
|
|
@@ -117,7 +117,7 @@ def is_paragraph_labelset_kind(labelset_id: str, classification_labels: knowledg
|
|
|
117
117
|
return False
|
|
118
118
|
|
|
119
119
|
|
|
120
|
-
def flatten_filter_literals(filters:
|
|
120
|
+
def flatten_filter_literals(filters: list[str] | dict[str, Any]) -> list[str]:
|
|
121
121
|
if isinstance(filters, list):
|
|
122
122
|
return filters
|
|
123
123
|
else:
|
|
@@ -130,20 +130,17 @@ def iter_filter_expression_literals(expression: dict[str, Any]) -> Iterator[str]
|
|
|
130
130
|
return
|
|
131
131
|
|
|
132
132
|
if "not" in expression:
|
|
133
|
-
|
|
134
|
-
yield label
|
|
133
|
+
yield from iter_filter_expression_literals(expression["not"])
|
|
135
134
|
return
|
|
136
135
|
|
|
137
136
|
if "and" in expression:
|
|
138
137
|
for and_term in expression["and"]:
|
|
139
|
-
|
|
140
|
-
yield label
|
|
138
|
+
yield from iter_filter_expression_literals(and_term)
|
|
141
139
|
return
|
|
142
140
|
|
|
143
141
|
if "or" in expression:
|
|
144
142
|
for or_term in expression["or"]:
|
|
145
|
-
|
|
146
|
-
yield label
|
|
143
|
+
yield from iter_filter_expression_literals(or_term)
|
|
147
144
|
return
|
|
148
145
|
|
|
149
146
|
|
|
@@ -151,7 +148,7 @@ def has_classification_label_filters(filters: list[str]) -> bool:
|
|
|
151
148
|
return any(label.startswith(CLASSIFICATION_LABEL_PREFIX) for label in filters)
|
|
152
149
|
|
|
153
150
|
|
|
154
|
-
def convert_to_node_filters(filters:
|
|
151
|
+
def convert_to_node_filters(filters: list[str] | list[Filter]) -> dict[str, Any]:
|
|
155
152
|
if len(filters) == 0:
|
|
156
153
|
return {}
|
|
157
154
|
|
|
@@ -161,7 +158,7 @@ def convert_to_node_filters(filters: Union[list[str], list[Filter]]) -> dict[str
|
|
|
161
158
|
return {"and": [convert_filter_to_node_schema(fltr) for fltr in filters]}
|
|
162
159
|
|
|
163
160
|
|
|
164
|
-
def convert_filter_to_node_schema(fltr:
|
|
161
|
+
def convert_filter_to_node_schema(fltr: str | Filter) -> dict[str, Any]:
|
|
165
162
|
if isinstance(fltr, str):
|
|
166
163
|
return {"literal": fltr}
|
|
167
164
|
|
nucliadb/search/search/find.py
CHANGED
|
@@ -23,7 +23,6 @@ from time import time
|
|
|
23
23
|
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
|
24
24
|
from nucliadb.common.external_index_providers.manager import get_external_index_manager
|
|
25
25
|
from nucliadb.common.models_utils import to_proto
|
|
26
|
-
from nucliadb.search.requesters.utils import Method, nidx_query
|
|
27
26
|
from nucliadb.search.search.find_merge import (
|
|
28
27
|
build_find_response,
|
|
29
28
|
compose_find_resources,
|
|
@@ -38,14 +37,16 @@ from nucliadb.search.search.metrics import (
|
|
|
38
37
|
)
|
|
39
38
|
from nucliadb.search.search.query_parser.models import ParsedQuery
|
|
40
39
|
from nucliadb.search.search.query_parser.parsers import parse_find
|
|
41
|
-
from nucliadb.search.search.query_parser.parsers.unit_retrieval import
|
|
42
|
-
|
|
43
|
-
|
|
40
|
+
from nucliadb.search.search.query_parser.parsers.unit_retrieval import (
|
|
41
|
+
convert_retrieval_to_proto,
|
|
42
|
+
get_rephrased_query,
|
|
43
|
+
is_incomplete,
|
|
44
44
|
)
|
|
45
45
|
from nucliadb.search.search.rerankers import (
|
|
46
46
|
RerankingOptions,
|
|
47
47
|
get_reranker,
|
|
48
48
|
)
|
|
49
|
+
from nucliadb.search.search.retrieval import text_block_search
|
|
49
50
|
from nucliadb.search.settings import settings
|
|
50
51
|
from nucliadb_models.search import (
|
|
51
52
|
FindRequest,
|
|
@@ -68,18 +69,16 @@ async def find(
|
|
|
68
69
|
) -> tuple[KnowledgeboxFindResults, bool, ParsedQuery]:
|
|
69
70
|
external_index_manager = await get_external_index_manager(kbid=kbid)
|
|
70
71
|
if external_index_manager is not None:
|
|
71
|
-
return await
|
|
72
|
+
return await _external_index_find(
|
|
72
73
|
kbid,
|
|
73
74
|
item,
|
|
74
75
|
external_index_manager,
|
|
75
76
|
)
|
|
76
77
|
else:
|
|
77
|
-
return await
|
|
78
|
-
kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for, metrics
|
|
79
|
-
)
|
|
78
|
+
return await _ndb_index_find(kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for, metrics)
|
|
80
79
|
|
|
81
80
|
|
|
82
|
-
async def
|
|
81
|
+
async def _ndb_index_find(
|
|
83
82
|
kbid: str,
|
|
84
83
|
item: FindRequest,
|
|
85
84
|
x_ndb_client: NucliaDBClientType,
|
|
@@ -95,32 +94,37 @@ async def _index_node_retrieval(
|
|
|
95
94
|
assert parsed.retrieval.rank_fusion is not None and parsed.retrieval.reranker is not None, (
|
|
96
95
|
"find parser must provide rank fusion and reranker algorithms"
|
|
97
96
|
)
|
|
98
|
-
rank_fusion = get_rank_fusion(parsed.retrieval.rank_fusion)
|
|
99
97
|
reranker = get_reranker(parsed.retrieval.reranker)
|
|
100
|
-
(
|
|
101
|
-
|
|
102
|
-
incomplete_results,
|
|
103
|
-
autofilters,
|
|
104
|
-
rephrased_query,
|
|
105
|
-
) = await legacy_convert_retrieval_to_proto(parsed)
|
|
98
|
+
incomplete_results = is_incomplete(parsed.retrieval)
|
|
99
|
+
rephrased_query = get_rephrased_query(parsed)
|
|
106
100
|
|
|
107
101
|
with metrics.time("index_search"):
|
|
108
|
-
|
|
102
|
+
text_blocks, pb_query, pb_response, queried_shards = await text_block_search(
|
|
103
|
+
kbid, parsed.retrieval
|
|
104
|
+
)
|
|
109
105
|
|
|
110
106
|
# Rank fusion merge, cut, hydrate and rerank
|
|
111
107
|
with metrics.time("results_merge"):
|
|
112
|
-
|
|
113
|
-
results,
|
|
114
|
-
retrieval=parsed.retrieval,
|
|
115
|
-
kbid=kbid,
|
|
116
|
-
query=pb_query.body,
|
|
117
|
-
rephrased_query=rephrased_query,
|
|
108
|
+
resource_hydration_options = ResourceHydrationOptions(
|
|
118
109
|
show=item.show,
|
|
119
110
|
extracted=item.extracted,
|
|
120
111
|
field_type_filter=item.field_type_filter,
|
|
112
|
+
)
|
|
113
|
+
text_block_hydration_options = TextBlockHydrationOptions(
|
|
121
114
|
highlight=item.highlight,
|
|
122
|
-
|
|
115
|
+
ematches=pb_response.paragraph.ematches, # type: ignore
|
|
116
|
+
)
|
|
117
|
+
search_results = await build_find_response(
|
|
118
|
+
pb_response,
|
|
119
|
+
text_blocks,
|
|
120
|
+
pb_response.graph,
|
|
121
|
+
retrieval=parsed.retrieval,
|
|
122
|
+
kbid=kbid,
|
|
123
|
+
query=item.query,
|
|
124
|
+
rephrased_query=rephrased_query,
|
|
123
125
|
reranker=reranker,
|
|
126
|
+
resource_hydration_options=resource_hydration_options,
|
|
127
|
+
text_block_hydration_options=text_block_hydration_options,
|
|
124
128
|
)
|
|
125
129
|
|
|
126
130
|
search_time = time() - start_time
|
|
@@ -137,7 +141,6 @@ async def _index_node_retrieval(
|
|
|
137
141
|
)
|
|
138
142
|
|
|
139
143
|
search_results.shards = queried_shards
|
|
140
|
-
search_results.autofilters = autofilters
|
|
141
144
|
|
|
142
145
|
ndb_time = metrics["index_search"] + metrics["results_merge"]
|
|
143
146
|
if metrics["index_search"] > settings.slow_node_query_log_threshold:
|
|
@@ -168,7 +171,7 @@ async def _index_node_retrieval(
|
|
|
168
171
|
return search_results, incomplete_results, parsed
|
|
169
172
|
|
|
170
173
|
|
|
171
|
-
async def
|
|
174
|
+
async def _external_index_find(
|
|
172
175
|
kbid: str,
|
|
173
176
|
item: FindRequest,
|
|
174
177
|
external_index_manager: ExternalIndexManager,
|
|
@@ -180,12 +183,12 @@ async def _external_index_retrieval(
|
|
|
180
183
|
parsed = await parse_find(kbid, item)
|
|
181
184
|
assert parsed.retrieval.reranker is not None, "find parser must provide a reranking algorithm"
|
|
182
185
|
reranker = get_reranker(parsed.retrieval.reranker)
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
)
|
|
186
|
+
incomplete_results = is_incomplete(parsed.retrieval)
|
|
187
|
+
rephrased_query = get_rephrased_query(parsed)
|
|
188
|
+
search_request = convert_retrieval_to_proto(parsed.retrieval)
|
|
186
189
|
|
|
187
190
|
# Query index
|
|
188
|
-
query_results = await external_index_manager.query(search_request)
|
|
191
|
+
query_results = await external_index_manager.query(search_request)
|
|
189
192
|
|
|
190
193
|
# Hydrate and rerank results
|
|
191
194
|
text_blocks, resources, best_matches = await hydrate_and_rerank(
|
|
@@ -220,7 +223,6 @@ async def _external_index_retrieval(
|
|
|
220
223
|
page_number=0,
|
|
221
224
|
page_size=item.top_k,
|
|
222
225
|
relations=None, # Not implemented for external indexes yet
|
|
223
|
-
autofilters=[], # Not implemented for external indexes yet
|
|
224
226
|
min_score=results_min_score,
|
|
225
227
|
best_matches=best_matches,
|
|
226
228
|
# These are not used for external indexes
|