nucliadb-6.9.1.post5192-py3-none-any.whl → nucliadb-6.10.0.post5705-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ nucliadb/search/search/chat/parser.py
@@ -0,0 +1,510 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import logging
+
+from pydantic import ValidationError
+
+from nucliadb.common.exceptions import InvalidQueryError
+from nucliadb.common.filter_expression import filter_from_facet
+from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap
+from nucliadb.search.search.chat.fetcher import RAOFetcher
+from nucliadb.search.search.query_parser.exceptions import InternalParserError
+from nucliadb.search.search.query_parser.models import (
+    RelationQuery,
+)
+from nucliadb.search.search.query_parser.old_filters import is_paragraph_label, translate_label
+from nucliadb.search.search.query_parser.parsers.common import (
+    parse_keyword_min_score,
+    should_disable_vector_search,
+)
+from nucliadb.search.search.rerankers import NoopReranker, PredictReranker, Reranker
+from nucliadb_models import retrieval as retrieval_models
+from nucliadb_models import search as search_models
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.filters import (
+    And,
+    DateCreated,
+    DateModified,
+    Field,
+    FieldFilterExpression,
+    FilterExpression,
+    Keyword,
+    Not,
+    Or,
+    ParagraphFilterExpression,
+    Resource,
+)
+from nucliadb_models.retrieval import RetrievalRequest
+from nucliadb_models.search import Filter, FindRequest
+from nucliadb_protos import knowledgebox_pb2, utils_pb2
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
+
+
+async def rao_parse_find(
+    kbid: str, find_request: FindRequest
+) -> tuple[RAOFetcher, RetrievalRequest, Reranker]:
+    # This is a thin layer to convert a FindRequest into a RetrievalRequest +
+    # some bw/c stuff we need while refactoring and decoupling code
+
+    fetcher = RAOFetcher(
+        kbid,
+        query=find_request.query,
+        user_vector=find_request.vector,
+        vectorset=find_request.vectorset,
+        rephrase=find_request.rephrase,
+        rephrase_prompt=find_request.rephrase_prompt,
+        generative_model=find_request.generative_model,
+        query_image=find_request.query_image,
+    )
+    parser = RAOFindParser(kbid, find_request, fetcher)
+    retrieval_request, reranker = await parser.parse()
+    return fetcher, retrieval_request, reranker
+
+
+class RAOFindParser:
+    def __init__(self, kbid: str, item: FindRequest, fetcher: RAOFetcher):
+        self.kbid = kbid
+        self.item = item
+        self.fetcher = fetcher
+
+        # cached data while parsing
+        self._query: retrieval_models.Query | None = None
+
+    async def parse(self) -> tuple[RetrievalRequest, Reranker]:
+        self._validate_request()
+
+        top_k = self.item.top_k
+
+        # parse search types (features)
+
+        self._query = retrieval_models.Query()
+
+        if search_models.FindOptions.KEYWORD in self.item.features:
+            self._query.keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)  # type: ignore
+
+        if search_models.FindOptions.SEMANTIC in self.item.features:
+            self._query.semantic = await parse_semantic_query(self.item, fetcher=self.fetcher)  # type: ignore
+
+        if search_models.FindOptions.RELATIONS in self.item.features:
+            # skip, we'll do something about this later on
+            pass
+
+        if search_models.FindOptions.GRAPH in self.item.features:
+            self._query.graph = await self._parse_graph_query()
+
+        filters = await self._parse_filters()
+
+        # rank fusion is just forwarded to /retrieve
+        rank_fusion = self.item.rank_fusion
+
+        try:
+            reranker = self._parse_reranker()
+        except ValidationError as exc:
+            raise InternalParserError(f"Parsing error in reranker: {exc!s}") from exc
+
+        # As we'll call /retrieve, that has rank fusion integrated, we have to
+        # make sure we ask for enough results to rerank.
+        if isinstance(reranker, PredictReranker):
+            top_k = max(top_k, reranker.window)
+
+        retrieval = RetrievalRequest(
+            query=self._query,
+            top_k=top_k,
+            filters=filters,
+            rank_fusion=rank_fusion,
+        )
+        return retrieval, reranker
+
+    def _validate_request(self):
+        # synonyms are not compatible with vector/graph search
+        if (
+            self.item.with_synonyms
+            and self.item.query
+            and (
+                search_models.FindOptions.SEMANTIC in self.item.features
+                or search_models.FindOptions.RELATIONS in self.item.features
+                or search_models.FindOptions.GRAPH in self.item.features
+            )
+        ):
+            raise InvalidQueryError(
+                "synonyms",
+                "Search with custom synonyms is only supported on paragraph and document search",
+            )
+
+        if search_models.FindOptions.SEMANTIC in self.item.features:
+            if should_disable_vector_search(self.item):
+                self.item.features.remove(search_models.FindOptions.SEMANTIC)
+
+        if self.item.graph_query and search_models.FindOptions.GRAPH not in self.item.features:
+            raise InvalidQueryError("graph_query", "Using a graph query requires enabling graph feature")
+
+    async def _parse_relation_query(self) -> RelationQuery:
+        detected_entities = await self._get_detected_entities()
+
+        return RelationQuery(
+            entry_points=detected_entities, deleted_entity_groups=[], deleted_entities={}
+        )
+
+    async def _parse_graph_query(self) -> retrieval_models.GraphQuery:
+        if self.item.graph_query is None:
+            raise InvalidQueryError(
+                "graph_query", "Graph query must be provided when using graph search"
+            )
+        return retrieval_models.GraphQuery(query=self.item.graph_query)
+
+    async def _get_detected_entities(self) -> list[utils_pb2.RelationNode]:
+        """Get entities from request, either automatically detected or
+        explicitly set by the user."""
+
+        if self.item.query_entities:
+            detected_entities = []
+            for entity in self.item.query_entities:
+                relation_node = utils_pb2.RelationNode()
+                relation_node.value = entity.name
+                if entity.type is not None:
+                    relation_node.ntype = RelationNodeTypeMap[entity.type]
+                if entity.subtype is not None:
+                    relation_node.subtype = entity.subtype
+                detected_entities.append(relation_node)
+        else:
+            detected_entities = await self.fetcher.get_detected_entities()
+
+        return detected_entities
+
+    async def _parse_filters(self) -> retrieval_models.Filters:
+        assert self._query is not None, "query must be parsed before filters"
+
+        # this is a conversion from /find filters to /retrieve filters. As
+        # /find keeps maintaining the old filter style, we must convert from
+        # one to the other
+
+        has_old_filters = (
+            len(self.item.filters) > 0
+            or len(self.item.resource_filters) > 0
+            or len(self.item.fields) > 0
+            or len(self.item.keyword_filters) > 0
+            or self.item.range_creation_start is not None
+            or self.item.range_creation_end is not None
+            or self.item.range_modification_start is not None
+            or self.item.range_modification_end is not None
+        )
+        if self.item.filter_expression is not None and has_old_filters:
+            raise InvalidQueryError("filter_expression", "Cannot mix old filters with filter_expression")
+
+        filter_expression = None
+
+        if has_old_filters:
+            # convert old filters into a filter expression
+
+            operator = FilterExpression.Operator.AND
+            field_expression: list[FieldFilterExpression] = []
+            paragraph_expression: list[ParagraphFilterExpression] = []
+
+            if self.item.range_creation_start or self.item.range_creation_end:
+                field_expression.append(
+                    DateCreated(
+                        since=self.item.range_creation_start,
+                        until=self.item.range_creation_end,
+                    )
+                )
+
+            if self.item.range_modification_start or self.item.range_modification_end:
+                field_expression.append(
+                    DateModified(
+                        since=self.item.range_modification_start,
+                        until=self.item.range_modification_end,
+                    )
+                )
+
+            if self.item.filters:
+                classification_labels = await self.fetcher.get_classification_labels()
+                field_exprs, paragraph_expr = convert_labels_to_filter_expressions(
+                    self.item.filters, classification_labels
+                )
+                if field_exprs:
+                    field_expression.extend(field_exprs)
+                if paragraph_expr:
+                    paragraph_expression.append(paragraph_expr)
+
+            if self.item.keyword_filters:
+                # keyword filters
+                for keyword_filter in self.item.keyword_filters:
+                    if isinstance(keyword_filter, str):
+                        field_expression.append(Keyword(word=keyword_filter))
+                    else:
+                        # model validates that one and only one of these match
+                        if keyword_filter.all:
+                            field_expression.append(
+                                And(operands=[Keyword(word=word) for word in keyword_filter.all])
+                            )
+                        elif keyword_filter.any:
+                            field_expression.append(
+                                Or(operands=[Keyword(word=word) for word in keyword_filter.any])
+                            )
+                        elif keyword_filter.none:
+                            field_expression.append(
+                                Not(
+                                    operand=Or(
+                                        operands=[Keyword(word=word) for word in keyword_filter.none]
+                                    )
+                                )
+                            )
+                        elif keyword_filter.not_all:
+                            field_expression.append(
+                                Not(
+                                    operand=And(
+                                        operands=[Keyword(word=word) for word in keyword_filter.not_all]
+                                    )
+                                )
+                            )
+
+            if self.item.fields:
+                operands: list[FieldFilterExpression] = []
+                for key in self.item.fields:
+                    parts = key.split("/")
+                    try:
+                        field_type = FieldTypeName.from_abbreviation(parts[0])
+                    except KeyError:  # pragma: no cover
+                        raise InvalidQueryError(
+                            "fields", f"field filter {key} has an invalid field type: {parts[0]}"
+                        )
+                    field_id = parts[1] if len(parts) > 1 else None
+                    operands.append(Field(type=field_type, name=field_id))
+
+                if len(operands) == 1:
+                    field_expression.append(operands[0])
+                elif len(operands) > 1:
+                    field_expression.append(Or(operands=operands))
+
+            if self.item.resource_filters:
+                operands = []
+                for key in self.item.resource_filters:
+                    parts = key.split("/")
+                    if len(parts) == 1:
+                        operands.append(Resource(id=parts[0]))
+                    else:
+                        rid = parts[0]
+                        field_type = FieldTypeName.from_abbreviation(parts[1])
+                        field_id = parts[2] if len(parts) > 2 else None
+                        operands.append(
+                            And(operands=[Resource(id=rid), Field(type=field_type, name=field_id)])
+                        )
+
+                if len(operands) == 1:
+                    field_expression.append(operands[0])
+                elif len(operands) > 1:
+                    field_expression.append(Or(operands=operands))
+
+            field = None
+            if len(field_expression) == 1:
+                field = field_expression[0]
+            elif len(field_expression) > 1:
+                field = And(operands=field_expression)
+
+            paragraph = None
+            if len(paragraph_expression) == 1:
+                paragraph = paragraph_expression[0]
+            elif len(paragraph_expression) > 1:
+                paragraph = And(operands=paragraph_expression)
+
+            if field or paragraph:
+                filter_expression = FilterExpression(field=field, paragraph=paragraph, operator=operator)
+
+        if self.item.filter_expression is not None:
+            filter_expression = self.item.filter_expression
+
+        return retrieval_models.Filters(
+            filter_expression=filter_expression,
+            show_hidden=self.item.show_hidden,
+            security=self.item.security,
+            with_duplicates=self.item.with_duplicates,
+        )
+
+    def _parse_reranker(self) -> Reranker:
+        reranker: Reranker
+        top_k = self.item.top_k
+
+        if isinstance(self.item.reranker, search_models.RerankerName):
+            if self.item.reranker == search_models.RerankerName.NOOP:
+                reranker = NoopReranker()
+
+            elif self.item.reranker == search_models.RerankerName.PREDICT_RERANKER:
+                # for the predict reranker, by default, we want a x2 factor with
+                # a top of 200 results
+                reranker = PredictReranker(window=min(top_k * 2, 200))
+
+            else:
+                raise InternalParserError(f"Unknown reranker algorithm: {self.item.reranker}")
+
+        elif isinstance(self.item.reranker, search_models.PredictReranker):
+            user_window = self.item.reranker.window
+            reranker = PredictReranker(window=min(max(user_window or 0, top_k), 200))
+
+        else:
+            raise InternalParserError(f"Unknown reranker {self.item.reranker}")
+
+        return reranker
+
+
+async def parse_keyword_query(
+    item: search_models.BaseSearchRequest,
+    *,
+    fetcher: RAOFetcher,
+) -> retrieval_models.KeywordQuery:
+    query = item.query
+
+    # If there was a rephrase with image, we should use the rephrased query for keyword search
+    rephrased_query = await fetcher.get_rephrased_query()
+    if item.query_image is not None and rephrased_query is not None:
+        query = rephrased_query
+
+    min_score = parse_keyword_min_score(item.min_score)
+
+    return retrieval_models.KeywordQuery(
+        query=query,
+        # Synonym checks are done at the retrieval endpoint already
+        with_synonyms=item.with_synonyms,
+        min_score=min_score,
+    )
+
+
+async def parse_semantic_query(
+    item: search_models.SearchRequest | search_models.FindRequest,
+    *,
+    fetcher: RAOFetcher,
+) -> retrieval_models.SemanticQuery:
+    vectorset = await fetcher.get_vectorset()
+    query = await fetcher.get_query_vector()
+
+    min_score = await parse_semantic_min_score(item.min_score, fetcher=fetcher)
+
+    return retrieval_models.SemanticQuery(query=query, vectorset=vectorset, min_score=min_score)
+
+
+async def parse_semantic_min_score(
+    min_score: float | search_models.MinScore | None,
+    *,
+    fetcher: RAOFetcher,
+) -> float:
+    if min_score is None:
+        min_score = None
+    elif isinstance(min_score, float):
+        min_score = min_score
+    else:
+        min_score = min_score.semantic
+    if min_score is None:
+        # min score not defined by the user, we'll try to get the default
+        # from Predict API
+        min_score = await fetcher.get_semantic_min_score()
+        if min_score is None:
+            logger.warning(
+                "Semantic threshold not found in query information, using default",
+                extra={"kbid": fetcher.kbid},
+            )
+            min_score = DEFAULT_GENERIC_SEMANTIC_THRESHOLD
+
+    return min_score
+
+
+def convert_labels_to_filter_expressions(
+    label_filters: list[str] | list[Filter], classification_labels: knowledgebox_pb2.Labels
+) -> tuple[list[FieldFilterExpression], ParagraphFilterExpression | None]:
+    field_expressions: list[FieldFilterExpression] = []
+    paragraph_expressions: list[ParagraphFilterExpression] = []
+
+    for label_filter in label_filters:
+        if isinstance(label_filter, str):
+            # translate_label
+            if len(label_filter) == 0:
+                raise InvalidQueryError("filters", "Invalid empty label")
+            if label_filter[0] != "/":
+                raise InvalidQueryError(
+                    "filters", f"Invalid label. It must start with a `/`: {label_filter}"
+                )
+
+            label = translate_label(label_filter)
+            facet_filter = filter_from_facet(label)
+
+            if is_paragraph_label(label, classification_labels):
+                paragraph_expressions.append(facet_filter)  # type: ignore[arg-type]
+            else:
+                field_expressions.append(facet_filter)  # type: ignore[arg-type]
+
+        else:
+            combinator: type[And[FieldFilterExpression]] | type[Or[FieldFilterExpression]]
+            if label_filter.all:
+                labels = label_filter.all
+                combinator, negate = And, False
+            elif label_filter.any:
+                labels = label_filter.any
+                combinator, negate = Or, False
+            elif label_filter.none:
+                labels = label_filter.none
+                combinator, negate = And, True
+            elif label_filter.not_all:
+                labels = label_filter.not_all
+                combinator, negate = Or, True
+            else:
+                # Empty filter, should not happen due to validation, but skip just in case
+                continue
+
+            # equivalent to split_labels
+            field = []
+            paragraph = []
+            for label in labels:
+                label = translate_label(label)
+                expr = filter_from_facet(label)
+
+                if negate:
+                    expr = Not(operand=expr)  # type: ignore
+
+                if is_paragraph_label(label, classification_labels):
+                    paragraph.append(expr)
+                else:
+                    field.append(expr)
+
+            if len(paragraph) > 0 and not (combinator == And and negate is False):
+                raise InvalidQueryError(
+                    "filters",
+                    "Paragraph labels can only be used with 'all' filter",
+                )
+
+            if len(field) == 1:
+                field_expressions.append(field[0])  # type: ignore
+            elif len(field) > 1:
+                field_expressions.append(combinator(operands=field))  # type: ignore
+
+            if len(paragraph) == 1:
+                paragraph_expressions.append(paragraph[0])  # type: ignore
+            elif len(paragraph) > 1:
+                paragraph_expressions.append(combinator(operands=paragraph))  # type: ignore
+
+    if len(paragraph_expressions) == 1:
+        paragraph_expression = paragraph_expressions[0]  # type: ignore
+    elif len(paragraph_expressions) > 1:
+        paragraph_expression = And(operands=paragraph_expressions)  # type: ignore
+    else:
+        paragraph_expression = None
+
+    return field_expressions, paragraph_expression
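
For orientation, below is a minimal sketch (not part of the diff) of how the new entry point is meant to be driven. It assumes a configured nucliadb search environment, since the RAOFetcher resolves query vectors and semantic thresholds through the Predict API; the knowledge box id "my-kb" and the query are placeholders, and FindRequest field defaults may differ between versions.

import asyncio

from nucliadb.search.search.chat.parser import rao_parse_find
from nucliadb_models import search as search_models
from nucliadb_models.search import FindRequest


async def main() -> None:
    # "my-kb" is a placeholder knowledge box id.
    request = FindRequest(
        query="how do I rotate my API keys?",
        features=[
            search_models.FindOptions.KEYWORD,
            search_models.FindOptions.SEMANTIC,
        ],
        top_k=20,
    )
    fetcher, retrieval, reranker = await rao_parse_find("my-kb", request)
    # With a PredictReranker in effect, parse() widens top_k to the rerank
    # window, min(top_k * 2, 200) = 40 here, so the /retrieve call returns
    # enough candidates to fill the window.
    print(retrieval.top_k, type(reranker).__name__)


asyncio.run(main())

Note that convert_labels_to_filter_expressions and _parse_filters are pure translations of the legacy /find filter style into the newer filter_expression model, which is why mixing both styles in one request raises InvalidQueryError.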