nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/search/search/query.py
CHANGED
@@ -23,12 +23,10 @@ import string
 from datetime import datetime
 from typing import Any, Awaitable, Optional, Union

-from async_lru import alru_cache
-
 from nucliadb.common import datamanagers
-from nucliadb.common.
+from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap
 from nucliadb.search import logger
-from nucliadb.search.predict import SendToPredictError
+from nucliadb.search.predict import SendToPredictError
 from nucliadb.search.search.filters import (
     convert_to_node_filters,
     flatten_filter_literals,
@@ -39,32 +37,31 @@ from nucliadb.search.search.filters import (
 )
 from nucliadb.search.search.metrics import (
     node_features,
-    query_parse_dependency_observer,
 )
+from nucliadb.search.search.query_parser.fetcher import Fetcher, get_classification_labels
 from nucliadb.search.search.rank_fusion import (
     RankFusionAlgorithm,
 )
 from nucliadb.search.search.rerankers import (
     Reranker,
 )
-from nucliadb.search.utilities import get_predict
 from nucliadb_models.internal.predict import QueryInfo
 from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
 from nucliadb_models.metadata import ResourceProcessingStatus
 from nucliadb_models.search import (
     Filter,
+    KnowledgeGraphEntity,
     MaxTokens,
     MinScore,
     SearchOptions,
     SortField,
-    SortFieldMap,
     SortOptions,
     SortOrder,
     SortOrderMap,
     SuggestOptions,
 )
 from nucliadb_models.security import RequestSecurity
-from nucliadb_protos import
+from nucliadb_protos import nodereader_pb2, utils_pb2
 from nucliadb_protos.noderesources_pb2 import Resource

 from .exceptions import InvalidQueryError
@@ -88,13 +85,6 @@ class QueryParser:
     """

     _query_information_task: Optional[asyncio.Task] = None
-    _get_vectorset_task: Optional[asyncio.Task] = None
-    _detected_entities_task: Optional[asyncio.Task] = None
-    _entities_meta_cache_task: Optional[asyncio.Task] = None
-    _deleted_entities_groups_task: Optional[asyncio.Task] = None
-    _synonyms_task: Optional[asyncio.Task] = None
-    _get_classification_labels_task: Optional[asyncio.Task] = None
-    _get_matryoshka_dimension_task: Optional[asyncio.Task] = None

     def __init__(
         self,
@@ -106,6 +96,7 @@ class QueryParser:
         keyword_filters: Union[list[str], list[Filter]],
         top_k: int,
         min_score: MinScore,
+        query_entities: Optional[list[KnowledgeGraphEntity]] = None,
         faceted: Optional[list[str]] = None,
         sort: Optional[SortOptions] = None,
         range_creation_start: Optional[datetime] = None,
@@ -132,6 +123,7 @@ class QueryParser:
         self.kbid = kbid
         self.features = features
         self.query = query
+        self.query_entities = query_entities
         self.hidden = hidden
         if self.hidden is not None:
             if self.hidden:
@@ -169,6 +161,15 @@ class QueryParser:
         self.max_tokens = max_tokens
         self.rank_fusion = rank_fusion
         self.reranker = reranker
+        self.fetcher = Fetcher(
+            kbid=kbid,
+            query=query,
+            user_vector=user_vector,
+            vectorset=vectorset,
+            rephrase=rephrase,
+            rephrase_prompt=rephrase_prompt,
+            generative_model=generative_model,
+        )

     @property
     def has_vector_search(self) -> bool:
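Note: the Fetcher constructed here centralizes the lazily-created, cached asyncio.Task lookups that QueryParser previously kept as one attribute per dependency (see the removals in the next hunk). A minimal sketch of that pattern, with all names hypothetical rather than taken from nucliadb/search/search/query_parser/fetcher.py:

    import asyncio
    from typing import Awaitable, Callable, Optional

    class LazyFetcher:
        """Caches each async dependency lookup as a single shared task."""

        def __init__(self, kbid: str, query: str):
            self.kbid = kbid
            self.query = query
            self._tasks: dict[str, asyncio.Task] = {}

        def _cached(self, key: str, factory: Callable[[], Awaitable]) -> asyncio.Task:
            # Create each dependency task at most once so concurrent callers
            # await the same in-flight lookup instead of re-fetching.
            if key not in self._tasks:
                self._tasks[key] = asyncio.create_task(factory())
            return self._tasks[key]

        async def get_synonyms(self) -> Optional[dict]:
            return await self._cached("synonyms", self._fetch_synonyms)

        async def _fetch_synonyms(self) -> Optional[dict]:
            # Stand-in for the real maindb / predict API lookup.
            return None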
@@ -184,78 +185,12 @@ class QueryParser:
         return self._query_information_task

     async def _query_information(self) -> QueryInfo:
-
-
-
-
-
-
-        if self._get_vectorset_task is None:
-            self._get_vectorset_task = asyncio.create_task(self._select_vectorset())
-        return self._get_vectorset_task
-
-    async def _select_vectorset(self) -> Optional[str]:
-        if self.vectorset:
-            return self.vectorset
-
-        # When vectorset is not provided we get the default from Predict API
-
-        try:
-            query_information = await self._get_query_information()
-        except SendToPredictError:
-            return None
-
-        if query_information.sentence is None:
-            logger.error(
-                "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
-            )
-            return None
-
-        for vectorset in query_information.sentence.vectors.keys():
-            self.vectorset = vectorset
-            break
-
-        return self.vectorset
-
-    def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]:
-        if self._get_matryoshka_dimension_task is None:
-            self._get_matryoshka_dimension_task = asyncio.create_task(self._matryoshka_dimension())
-        return self._get_matryoshka_dimension_task
-
-    async def _matryoshka_dimension(self) -> Optional[int]:
-        vectorset = await self._select_vectorset()
-        return await get_matryoshka_dimension_cached(self.kbid, vectorset)
-
-    def _get_detected_entities(self) -> Awaitable[list[utils_pb2.RelationNode]]:
-        if self._detected_entities_task is None:  # pragma: no cover
-            self._detected_entities_task = asyncio.create_task(detect_entities(self.kbid, self.query))
-        return self._detected_entities_task
-
-    def _get_entities_meta_cache(
-        self,
-    ) -> Awaitable[datamanagers.entities.EntitiesMetaCache]:
-        if self._entities_meta_cache_task is None:
-            self._entities_meta_cache_task = asyncio.create_task(get_entities_meta_cache(self.kbid))
-        return self._entities_meta_cache_task
-
-    def _get_deleted_entity_groups(self) -> Awaitable[list[str]]:
-        if self._deleted_entities_groups_task is None:
-            self._deleted_entities_groups_task = asyncio.create_task(
-                get_deleted_entity_groups(self.kbid)
-            )
-        return self._deleted_entities_groups_task
-
-    def _get_synomyns(self) -> Awaitable[Optional[knowledgebox_pb2.Synonyms]]:
-        if self._synonyms_task is None:
-            self._synonyms_task = asyncio.create_task(get_kb_synonyms(self.kbid))
-        return self._synonyms_task
-
-    def _get_classification_labels(self) -> Awaitable[knowledgebox_pb2.Labels]:
-        if self._get_classification_labels_task is None:
-            self._get_classification_labels_task = asyncio.create_task(
-                get_classification_labels(self.kbid)
-            )
-        return self._get_classification_labels_task
+        # HACK: while transitioning to the new query parser, use fetcher under
+        # the hood for a smoother migration
+        query_info = await self.fetcher._predict_query_endpoint()
+        if query_info is None:
+            raise SendToPredictError("Error while using predict's query endpoint")
+        return query_info

     async def _schedule_dependency_tasks(self) -> None:
         """
@@ -263,23 +198,24 @@ class QueryParser:
         for the sake of the query being performed
         """
         if len(self.label_filters) > 0 and has_classification_label_filters(self.flat_label_filters):
-            asyncio.ensure_future(self.
+            asyncio.ensure_future(self.fetcher.get_classification_labels())

         if self.has_vector_search and self.user_vector is None:
             self.query_endpoint_used = True
             asyncio.ensure_future(self._get_query_information())
-
+            # XXX: should we also ensure get_vectorset and get_query_vector?
+            asyncio.ensure_future(self.fetcher.get_matryoshka_dimension())

         if (self.has_relations_search or self.autofilter) and len(self.query) > 0:
             if not self.query_endpoint_used:
                 # If we only need to detect entities, we don't need the query endpoint
-                asyncio.ensure_future(self.
-            asyncio.ensure_future(self.
-            asyncio.ensure_future(self.
+                asyncio.ensure_future(self.fetcher.get_detected_entities())
+            asyncio.ensure_future(self.fetcher.get_entities_meta_cache())
+            asyncio.ensure_future(self.fetcher.get_deleted_entity_groups())
         if self.with_synonyms and self.query:
-            asyncio.ensure_future(self.
+            asyncio.ensure_future(self.fetcher.get_synonyms())

-    async def parse(self) -> tuple[nodereader_pb2.SearchRequest, bool, list[str]]:
+    async def parse(self) -> tuple[nodereader_pb2.SearchRequest, bool, list[str], Optional[str]]:
         """
         :return: (request, incomplete, autofilters)
         where:
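Note: parse() now returns a 4-tuple, while the docstring above still documents the old 3-tuple. A hypothetical call site (names assumed, not package code) showing the extra element, the rephrased query, which is None when predict performed no rephrasing:

    async def build_search_request(query_parser: QueryParser):
        pb_query, incomplete, autofilters, rephrased_query = await query_parser.parse()
        # rephrased_query can be surfaced to the client next to the results
        return pb_query, incomplete, autofilters, rephrased_query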
@@ -298,19 +234,20 @@ class QueryParser:
         await self.parse_filters(request)
         self.parse_document_search(request)
         self.parse_paragraph_search(request)
-        incomplete = await self.parse_vector_search(request)
+        incomplete, rephrased_query = await self.parse_vector_search(request)
+        # BUG: autofilters are not used to filter, but we say we do
         autofilters = await self.parse_relation_search(request)
         await self.parse_synonyms(request)
         await self.parse_min_score(request, incomplete)
         await self.adjust_page_size(request, self.rank_fusion, self.reranker)
-        return request, incomplete, autofilters
+        return request, incomplete, autofilters, rephrased_query

     async def parse_filters(self, request: nodereader_pb2.SearchRequest) -> None:
         if len(self.label_filters) > 0:
             field_labels = self.flat_label_filters
             paragraph_labels: list[str] = []
             if has_classification_label_filters(self.flat_label_filters):
-                classification_labels = await self.
+                classification_labels = await self.fetcher.get_classification_labels()
                 field_labels, paragraph_labels = split_labels_by_type(
                     self.flat_label_filters, classification_labels
                 )
@@ -388,7 +325,7 @@ class QueryParser:
         else:
             request.result_per_page = self.top_k

-        sort_field =
+        sort_field = get_sort_field_proto(self.sort.field) if self.sort else None
         if sort_field is not None:
             request.order.sort_by = sort_field
             request.order.type = SortOrderMap[self.sort.order]  # type: ignore
@@ -399,19 +336,13 @@ class QueryParser:
             semantic_min_score = self.min_score.semantic
         elif self.has_vector_search and not incomplete:
             query_information = await self._get_query_information()
-            vectorset = await self.
-
-
-
-                semantic_min_score = semantic_threshold
-            else:
-                logger.warning(
-                    "Semantic threshold not found in query information, using default",
-                    extra={"kbid": self.kbid},
-                )
+            vectorset = await self.fetcher.get_vectorset()
+            semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
+            if semantic_threshold is not None:
+                semantic_min_score = semantic_threshold
         else:
             logger.warning(
-                "
+                "Semantic threshold not found in query information, using default",
                 extra={"kbid": self.kbid},
             )
         self.min_score.semantic = semantic_min_score
@@ -428,91 +359,49 @@ class QueryParser:
         request.paragraph = True
         node_features.inc({"type": "paragraphs"})

-    async def
-
-
-
-        """
-        if not self.vectorset:
-            return None
-
-        # validate vectorset
-        async with datamanagers.with_ro_transaction() as txn:
-            if not await datamanagers.vectorsets.exists(
-                txn, kbid=self.kbid, vectorset_id=self.vectorset
-            ):
-                raise InvalidQueryError(
-                    "vectorset",
-                    f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box",
-                )
-        return self.vectorset
-
-    async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool:
+    async def parse_vector_search(
+        self, request: nodereader_pb2.SearchRequest
+    ) -> tuple[bool, Optional[str]]:
         if not self.has_vector_search:
-            return False
+            return False, None

         node_features.inc({"type": "vectors"})

-
-
-
-
-            request.vectorset = vectorset
-
-        query_vector = None
-        if self.user_vector is None:
-            try:
-                query_info = await self._get_query_information()
-            except SendToPredictError as err:
-                logger.warning(f"Errors on predict api trying to embedd query: {err}")
-                incomplete = True
-            else:
-                if query_info and query_info.sentence:
-                    if vectorset:
-                        if vectorset in query_info.sentence.vectors:
-                            query_vector = query_info.sentence.vectors[vectorset]
-                        else:
-                            incomplete = True
-                    else:
-                        for vectorset_id, vector in query_info.sentence.vectors.items():
-                            if vector:
-                                query_vector = vector
-                                break
-                        else:
-                            incomplete = True
-
-                else:
-                    incomplete = True
-        else:
-            query_vector = self.user_vector
+        vectorset = await self.fetcher.get_vectorset()
+        query_vector = await self.fetcher.get_query_vector()
+        rephrased_query = await self.fetcher.get_rephrased_query()
+        incomplete = query_vector is None

+        request.vectorset = vectorset
         if query_vector is not None:
-            matryoshka_dimension = await self._get_matryoshka_dimension()
-            if matryoshka_dimension is not None:
-                # KB using a matryoshka embeddings model, cut the query vector
-                # accordingly
-                query_vector = query_vector[:matryoshka_dimension]
             request.vector.extend(query_vector)

-        return incomplete
+        return incomplete, rephrased_query

     async def parse_relation_search(self, request: nodereader_pb2.SearchRequest) -> list[str]:
         autofilters = []
+        # BUG: autofiler should autofilter, not enable relation search
         if self.has_relations_search or self.autofilter:
-            if
-                detected_entities =
+            if self.query_entities:
+                detected_entities = []
+                for entity in self.query_entities:
+                    relation_node = utils_pb2.RelationNode()
+                    relation_node.value = entity.name
+                    if entity.type is not None:
+                        relation_node.ntype = RelationNodeTypeMap[entity.type]
+                    if entity.subtype is not None:
+                        relation_node.subtype = entity.subtype
+                    detected_entities.append(relation_node)
             else:
-
-
-                detected_entities = convert_relations(query_info_result.entities.model_dump())
-            else:
-                detected_entities = []
-            meta_cache = await self._get_entities_meta_cache()
+                detected_entities = await self.fetcher.get_detected_entities()
+            meta_cache = await self.fetcher.get_entities_meta_cache()
             detected_entities = expand_entities(meta_cache, detected_entities)
             if self.has_relations_search:
                 request.relation_subgraph.entry_points.extend(detected_entities)
                 request.relation_subgraph.depth = 1
-                request.relation_subgraph.deleted_groups.extend(
+                request.relation_subgraph.deleted_groups.extend(
+                    await self.fetcher.get_deleted_entity_groups()
+                )
                 for group_id, deleted_entities in meta_cache.deleted_entities.items():
                     request.relation_subgraph.deleted_entities.append(
                         nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
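Note: with this change a caller can supply pre-identified query_entities and parse_relation_search() builds the RelationNode entry points directly, skipping predict's entity detection. A hypothetical construction, assuming KnowledgeGraphEntity accepts these fields as keyword arguments (only the name/type/subtype attribute accesses above are confirmed by the diff):

    from nucliadb_models.search import KnowledgeGraphEntity

    # Assumed constructor shape; "Marie Curie" and "scientist" are examples.
    entities = [KnowledgeGraphEntity(name="Marie Curie", subtype="scientist")]
    # QueryParser(..., query_entities=entities) then maps each entity onto a
    # utils_pb2.RelationNode without awaiting fetcher.get_detected_entities().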
@@ -545,7 +434,7 @@ class QueryParser:
                 "Search with custom synonyms is only supported on paragraph and document search",
             )

-        synonyms = await self.
+        synonyms = await self.fetcher.get_synonyms()
         if synonyms is None:
             # No synonyms found
             return
@@ -681,29 +570,6 @@ async def paragraph_query_to_pb(
     return request


-@query_parse_dependency_observer.wrap({"type": "query_information"})
-async def query_information(
-    kbid: str,
-    query: str,
-    semantic_model: Optional[str],
-    generative_model: Optional[str] = None,
-    rephrase: bool = False,
-    rephrase_prompt: Optional[str] = None,
-) -> QueryInfo:
-    predict = get_predict()
-    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
-
-
-@query_parse_dependency_observer.wrap({"type": "detect_entities"})
-async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]:
-    predict = get_predict()
-    try:
-        return await predict.detect_entities(kbid, query)
-    except SendToPredictError as ex:
-        logger.warning(f"Errors on predict api detecting entities: {ex}")
-        return []
-
-
 def expand_entities(
     meta_cache: datamanagers.entities.EntitiesMetaCache,
     detected_entities: list[utils_pb2.RelationNode],
@@ -834,30 +700,6 @@ PROCESSING_STATUS_TO_PB_MAP = {
 }


-@query_parse_dependency_observer.wrap({"type": "synonyms"})
-async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.synonyms.get(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
-async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
-async def get_deleted_entity_groups(kbid: str) -> list[str]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
-
-
-@query_parse_dependency_observer.wrap({"type": "classification_labels"})
-async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.labels.get_labels(txn, kbid=kbid)
-
-
 def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]):
     """
     Check if the provided filters are supported:
@@ -890,23 +732,10 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
     )


-
-
-
-
-
-
-
-async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    async with get_driver().transaction(read_only=True) as txn:
-        matryoshka_dimension = None
-        if not vectorset:
-            # XXX this should be migrated once we remove the "default" vectorset
-            # concept
-            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
-        else:
-            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
-            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
-                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
-
-        return matryoshka_dimension
+def get_sort_field_proto(obj: SortField) -> Optional[nodereader_pb2.OrderBy.OrderField.ValueType]:
+    return {
+        SortField.SCORE: None,
+        SortField.CREATED: nodereader_pb2.OrderBy.OrderField.CREATED,
+        SortField.MODIFIED: nodereader_pb2.OrderBy.OrderField.MODIFIED,
+        SortField.TITLE: None,
+    }[obj]
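Note: get_sort_field_proto apparently replaces the SortFieldMap lookup whose import was removed at the top of the file. Score and title sorts map to None, presumably because they have no index-level order field. An illustrative check (not package code):

    from nucliadb_models.search import SortField
    from nucliadb_protos import nodereader_pb2

    assert get_sort_field_proto(SortField.CREATED) == nodereader_pb2.OrderBy.OrderField.CREATED
    assert get_sort_field_proto(SortField.SCORE) is None  # score ordering is handled outside the index sort field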
nucliadb/search/search/query_parser/exceptions.py
CHANGED
@@ -19,4 +19,14 @@
 #


-class
+class InternalParserError(ValueError):
+    """Raised when parsing fails due to some internal error"""
+
+
+class InvalidQueryError(Exception):
+    """Raised when parsing a query containing an invalid parameter"""
+
+    def __init__(self, param: str, reason: str):
+        self.param = param
+        self.reason = reason
+        super().__init__(f"Invalid query. Error in {param}: {reason}")
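Note: the reworked InvalidQueryError keeps the offending parameter and the reason as attributes, so callers can report both. A small usage sketch (illustration only; the error message is an example):

    from nucliadb.search.search.query_parser.exceptions import InvalidQueryError

    try:
        raise InvalidQueryError("vectorset", "Vectorset multilingual doesn't exist in your Knowledge Box")
    except InvalidQueryError as exc:
        assert exc.param == "vectorset"
        print(exc)  # Invalid query. Error in vectorset: Vectorset multilingual doesn't exist...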