nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/search/search/query.py
@@ -23,12 +23,10 @@ import string
 from datetime import datetime
 from typing import Any, Awaitable, Optional, Union
 
-from async_lru import alru_cache
-
 from nucliadb.common import datamanagers
-from nucliadb.common.maindb.utils import get_driver
+from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap
 from nucliadb.search import logger
-from nucliadb.search.predict import SendToPredictError, convert_relations
+from nucliadb.search.predict import SendToPredictError
 from nucliadb.search.search.filters import (
     convert_to_node_filters,
     flatten_filter_literals,
@@ -39,32 +37,31 @@ from nucliadb.search.search.filters import (
 )
 from nucliadb.search.search.metrics import (
     node_features,
-    query_parse_dependency_observer,
 )
+from nucliadb.search.search.query_parser.fetcher import Fetcher, get_classification_labels
 from nucliadb.search.search.rank_fusion import (
     RankFusionAlgorithm,
 )
 from nucliadb.search.search.rerankers import (
     Reranker,
 )
-from nucliadb.search.utilities import get_predict
 from nucliadb_models.internal.predict import QueryInfo
 from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
 from nucliadb_models.metadata import ResourceProcessingStatus
 from nucliadb_models.search import (
     Filter,
+    KnowledgeGraphEntity,
     MaxTokens,
     MinScore,
     SearchOptions,
     SortField,
-    SortFieldMap,
     SortOptions,
     SortOrder,
     SortOrderMap,
     SuggestOptions,
 )
 from nucliadb_models.security import RequestSecurity
-from nucliadb_protos import knowledgebox_pb2, nodereader_pb2, utils_pb2
+from nucliadb_protos import nodereader_pb2, utils_pb2
 from nucliadb_protos.noderesources_pb2 import Resource
 
 from .exceptions import InvalidQueryError
@@ -88,13 +85,6 @@ class QueryParser:
     """
 
     _query_information_task: Optional[asyncio.Task] = None
-    _get_vectorset_task: Optional[asyncio.Task] = None
-    _detected_entities_task: Optional[asyncio.Task] = None
-    _entities_meta_cache_task: Optional[asyncio.Task] = None
-    _deleted_entities_groups_task: Optional[asyncio.Task] = None
-    _synonyms_task: Optional[asyncio.Task] = None
-    _get_classification_labels_task: Optional[asyncio.Task] = None
-    _get_matryoshka_dimension_task: Optional[asyncio.Task] = None
 
     def __init__(
         self,
@@ -106,6 +96,7 @@ class QueryParser:
         keyword_filters: Union[list[str], list[Filter]],
         top_k: int,
         min_score: MinScore,
+        query_entities: Optional[list[KnowledgeGraphEntity]] = None,
         faceted: Optional[list[str]] = None,
         sort: Optional[SortOptions] = None,
         range_creation_start: Optional[datetime] = None,
@@ -132,6 +123,7 @@ class QueryParser:
         self.kbid = kbid
         self.features = features
         self.query = query
+        self.query_entities = query_entities
         self.hidden = hidden
         if self.hidden is not None:
             if self.hidden:
@@ -169,6 +161,15 @@ class QueryParser:
         self.max_tokens = max_tokens
         self.rank_fusion = rank_fusion
         self.reranker = reranker
+        self.fetcher = Fetcher(
+            kbid=kbid,
+            query=query,
+            user_vector=user_vector,
+            vectorset=vectorset,
+            rephrase=rephrase,
+            rephrase_prompt=rephrase_prompt,
+            generative_model=generative_model,
+        )
 
     @property
     def has_vector_search(self) -> bool:
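Note: the `Fetcher` constructed here replaces the seven per-request `asyncio.Task` attributes deleted above. Each of its `get_*` methods computes its dependency once and caches it, so repeated awaits during parsing don't re-hit Predict or maindb. A minimal sketch of that memoization pattern, with hypothetical names (the real implementation lives in `nucliadb/search/search/query_parser/fetcher.py`, added in this release):

    import asyncio
    from typing import Awaitable, Callable, Optional

    class LazyFetcher:
        # Sketch: each dependency is computed once; every awaiter shares
        # the same task, mirroring the removed _get_*_task attributes.
        def __init__(self) -> None:
            self._tasks: dict[str, asyncio.Task] = {}

        def _cached(self, key: str, factory: Callable[[], Awaitable]) -> asyncio.Task:
            if key not in self._tasks:
                self._tasks[key] = asyncio.create_task(factory())
            return self._tasks[key]

        async def get_vectorset(self) -> Optional[str]:
            return await self._cached("vectorset", self._fetch_vectorset)

        async def _fetch_vectorset(self) -> Optional[str]:
            return "multilingual-2024"  # placeholder for a Predict/maindb lookup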
@@ -184,78 +185,12 @@ class QueryParser:
         return self._query_information_task
 
     async def _query_information(self) -> QueryInfo:
-        vectorset = await self.select_query_vectorset()
-        return await query_information(
-            self.kbid, self.query, vectorset, self.generative_model, self.rephrase, self.rephrase_prompt
-        )
-
-    def _get_vectorset(self) -> Awaitable[Optional[str]]:
-        if self._get_vectorset_task is None:
-            self._get_vectorset_task = asyncio.create_task(self._select_vectorset())
-        return self._get_vectorset_task
-
-    async def _select_vectorset(self) -> Optional[str]:
-        if self.vectorset:
-            return self.vectorset
-
-        # When vectorset is not provided we get the default from Predict API
-
-        try:
-            query_information = await self._get_query_information()
-        except SendToPredictError:
-            return None
-
-        if query_information.sentence is None:
-            logger.error(
-                "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
-            )
-            return None
-
-        for vectorset in query_information.sentence.vectors.keys():
-            self.vectorset = vectorset
-            break
-
-        return self.vectorset
-
-    def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]:
-        if self._get_matryoshka_dimension_task is None:
-            self._get_matryoshka_dimension_task = asyncio.create_task(self._matryoshka_dimension())
-        return self._get_matryoshka_dimension_task
-
-    async def _matryoshka_dimension(self) -> Optional[int]:
-        vectorset = await self._select_vectorset()
-        return await get_matryoshka_dimension_cached(self.kbid, vectorset)
-
-    def _get_detected_entities(self) -> Awaitable[list[utils_pb2.RelationNode]]:
-        if self._detected_entities_task is None:  # pragma: no cover
-            self._detected_entities_task = asyncio.create_task(detect_entities(self.kbid, self.query))
-        return self._detected_entities_task
-
-    def _get_entities_meta_cache(
-        self,
-    ) -> Awaitable[datamanagers.entities.EntitiesMetaCache]:
-        if self._entities_meta_cache_task is None:
-            self._entities_meta_cache_task = asyncio.create_task(get_entities_meta_cache(self.kbid))
-        return self._entities_meta_cache_task
-
-    def _get_deleted_entity_groups(self) -> Awaitable[list[str]]:
-        if self._deleted_entities_groups_task is None:
-            self._deleted_entities_groups_task = asyncio.create_task(
-                get_deleted_entity_groups(self.kbid)
-            )
-        return self._deleted_entities_groups_task
-
-    def _get_synomyns(self) -> Awaitable[Optional[knowledgebox_pb2.Synonyms]]:
-        if self._synonyms_task is None:
-            self._synonyms_task = asyncio.create_task(get_kb_synonyms(self.kbid))
-        return self._synonyms_task
-
-    def _get_classification_labels(self) -> Awaitable[knowledgebox_pb2.Labels]:
-        if self._get_classification_labels_task is None:
-            self._get_classification_labels_task = asyncio.create_task(
-                get_classification_labels(self.kbid)
-            )
-        return self._get_classification_labels_task
+        # HACK: while transitioning to the new query parser, use fetcher under
+        # the hood for a smoother migration
+        query_info = await self.fetcher._predict_query_endpoint()
+        if query_info is None:
+            raise SendToPredictError("Error while using predict's query endpoint")
+        return query_info
 
     async def _schedule_dependency_tasks(self) -> None:
         """
@@ -263,23 +198,24 @@ class QueryParser:
         for the sake of the query being performed
         """
         if len(self.label_filters) > 0 and has_classification_label_filters(self.flat_label_filters):
-            asyncio.ensure_future(self._get_classification_labels())
+            asyncio.ensure_future(self.fetcher.get_classification_labels())
 
         if self.has_vector_search and self.user_vector is None:
            self.query_endpoint_used = True
            asyncio.ensure_future(self._get_query_information())
-            asyncio.ensure_future(self._get_matryoshka_dimension())
+            # XXX: should we also ensure get_vectorset and get_query_vector?
+            asyncio.ensure_future(self.fetcher.get_matryoshka_dimension())
 
         if (self.has_relations_search or self.autofilter) and len(self.query) > 0:
             if not self.query_endpoint_used:
                 # If we only need to detect entities, we don't need the query endpoint
-                asyncio.ensure_future(self._get_detected_entities())
-            asyncio.ensure_future(self._get_entities_meta_cache())
-            asyncio.ensure_future(self._get_deleted_entity_groups())
+                asyncio.ensure_future(self.fetcher.get_detected_entities())
+            asyncio.ensure_future(self.fetcher.get_entities_meta_cache())
+            asyncio.ensure_future(self.fetcher.get_deleted_entity_groups())
         if self.with_synonyms and self.query:
-            asyncio.ensure_future(self._get_synomyns())
+            asyncio.ensure_future(self.fetcher.get_synonyms())
 
-    async def parse(self) -> tuple[nodereader_pb2.SearchRequest, bool, list[str]]:
+    async def parse(self) -> tuple[nodereader_pb2.SearchRequest, bool, list[str], Optional[str]]:
         """
         :return: (request, incomplete, autofilters)
         where:
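Note: `_schedule_dependency_tasks` keeps its fire-and-forget shape; it now just targets the fetcher. `asyncio.ensure_future` starts each lookup early so it runs concurrently with the rest of parsing, and because the fetcher caches internally, the later `await fetcher.get_*()` calls return the already-resolved result. A self-contained illustration of the pattern (not nucliadb code):

    import asyncio

    async def fetch_labels() -> list[str]:
        await asyncio.sleep(0.1)  # stands in for a maindb read
        return ["/l/labelset/label"]

    async def parse() -> None:
        task = asyncio.ensure_future(fetch_labels())  # kick off early, don't block
        await asyncio.sleep(0.1)  # other parsing work happens meanwhile
        labels = await task  # usually resolved by now, so no extra latency
        print(labels)

    asyncio.run(parse())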
@@ -298,19 +234,20 @@ class QueryParser:
         await self.parse_filters(request)
         self.parse_document_search(request)
         self.parse_paragraph_search(request)
-        incomplete = await self.parse_vector_search(request)
+        incomplete, rephrased_query = await self.parse_vector_search(request)
+        # BUG: autofilters are not used to filter, but we say we do
         autofilters = await self.parse_relation_search(request)
         await self.parse_synonyms(request)
         await self.parse_min_score(request, incomplete)
         await self.adjust_page_size(request, self.rank_fusion, self.reranker)
-        return request, incomplete, autofilters
+        return request, incomplete, autofilters, rephrased_query
 
     async def parse_filters(self, request: nodereader_pb2.SearchRequest) -> None:
         if len(self.label_filters) > 0:
             field_labels = self.flat_label_filters
             paragraph_labels: list[str] = []
             if has_classification_label_filters(self.flat_label_filters):
-                classification_labels = await self._get_classification_labels()
+                classification_labels = await self.fetcher.get_classification_labels()
                 field_labels, paragraph_labels = split_labels_by_type(
                     self.flat_label_filters, classification_labels
                 )
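Note: `parse()` now returns a 4-tuple; `rephrased_query` carries the query text Predict produced when rephrasing is enabled, so callers can surface it in the response. A hedged sketch of a call site (names assumed, not an actual nucliadb call site):

    async def build_request(parser: QueryParser) -> nodereader_pb2.SearchRequest:
        # parse() now returns four values instead of three
        request, incomplete, autofilters, rephrased_query = await parser.parse()
        if rephrased_query is not None:
            # e.g. echo the rephrased text back in the API response
            logger.info("query rephrased", extra={"rephrased": rephrased_query})
        return request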
@@ -388,7 +325,7 @@ class QueryParser:
         else:
             request.result_per_page = self.top_k
 
-        sort_field = SortFieldMap[self.sort.field] if self.sort else None
+        sort_field = get_sort_field_proto(self.sort.field) if self.sort else None
         if sort_field is not None:
             request.order.sort_by = sort_field
             request.order.type = SortOrderMap[self.sort.order]  # type: ignore
@@ -399,19 +336,13 @@ class QueryParser:
             semantic_min_score = self.min_score.semantic
         elif self.has_vector_search and not incomplete:
             query_information = await self._get_query_information()
-            vectorset = await self._select_vectorset()
-            if vectorset is not None:
-                semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
-                if semantic_threshold is not None:
-                    semantic_min_score = semantic_threshold
-                else:
-                    logger.warning(
-                        "Semantic threshold not found in query information, using default",
-                        extra={"kbid": self.kbid},
-                    )
+            vectorset = await self.fetcher.get_vectorset()
+            semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
+            if semantic_threshold is not None:
+                semantic_min_score = semantic_threshold
             else:
                 logger.warning(
-                    "Vectorset unset by user or predict, using default semantic threshold",
+                    "Semantic threshold not found in query information, using default",
                     extra={"kbid": self.kbid},
                 )
         self.min_score.semantic = semantic_min_score
@@ -428,91 +359,49 @@ class QueryParser:
             request.paragraph = True
             node_features.inc({"type": "paragraphs"})
 
-    async def select_query_vectorset(self) -> Optional[str]:
-        """Set and return the requested vectorset parameter (if used) validated
-        for the current KB.
-
-        """
-        if not self.vectorset:
-            return None
-
-        # validate vectorset
-        async with datamanagers.with_ro_transaction() as txn:
-            if not await datamanagers.vectorsets.exists(
-                txn, kbid=self.kbid, vectorset_id=self.vectorset
-            ):
-                raise InvalidQueryError(
-                    "vectorset",
-                    f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box",
-                )
-        return self.vectorset
-
-    async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool:
+    async def parse_vector_search(
+        self, request: nodereader_pb2.SearchRequest
+    ) -> tuple[bool, Optional[str]]:
         if not self.has_vector_search:
-            return False
+            return False, None
 
         node_features.inc({"type": "vectors"})
 
-        incomplete = False
-
-        vectorset = await self._select_vectorset()
-        if vectorset is not None:
-            request.vectorset = vectorset
-
-        query_vector = None
-        if self.user_vector is None:
-            try:
-                query_info = await self._get_query_information()
-            except SendToPredictError as err:
-                logger.warning(f"Errors on predict api trying to embedd query: {err}")
-                incomplete = True
-            else:
-                if query_info and query_info.sentence:
-                    if vectorset:
-                        if vectorset in query_info.sentence.vectors:
-                            query_vector = query_info.sentence.vectors[vectorset]
-                        else:
-                            incomplete = True
-                    else:
-                        for vectorset_id, vector in query_info.sentence.vectors.items():
-                            if vector:
-                                query_vector = vector
-                                break
-                        else:
-                            incomplete = True
-
-                else:
-                    incomplete = True
-        else:
-            query_vector = self.user_vector
+        vectorset = await self.fetcher.get_vectorset()
+        query_vector = await self.fetcher.get_query_vector()
+        rephrased_query = await self.fetcher.get_rephrased_query()
+        incomplete = query_vector is None
 
+        request.vectorset = vectorset
         if query_vector is not None:
-            matryoshka_dimension = await self._get_matryoshka_dimension()
-            if matryoshka_dimension is not None:
-                # KB using a matryoshka embeddings model, cut the query vector
-                # accordingly
-                query_vector = query_vector[:matryoshka_dimension]
             request.vector.extend(query_vector)
 
-        return incomplete
+        return incomplete, rephrased_query
 
     async def parse_relation_search(self, request: nodereader_pb2.SearchRequest) -> list[str]:
         autofilters = []
+        # BUG: autofiler should autofilter, not enable relation search
         if self.has_relations_search or self.autofilter:
-            if not self.query_endpoint_used:
-                detected_entities = await self._get_detected_entities()
+            if self.query_entities:
+                detected_entities = []
+                for entity in self.query_entities:
+                    relation_node = utils_pb2.RelationNode()
+                    relation_node.value = entity.name
+                    if entity.type is not None:
+                        relation_node.ntype = RelationNodeTypeMap[entity.type]
+                    if entity.subtype is not None:
+                        relation_node.subtype = entity.subtype
+                    detected_entities.append(relation_node)
             else:
-                query_info_result = await self._get_query_information()
-                if query_info_result.entities:
-                    detected_entities = convert_relations(query_info_result.entities.model_dump())
-                else:
-                    detected_entities = []
-            meta_cache = await self._get_entities_meta_cache()
+                detected_entities = await self.fetcher.get_detected_entities()
+            meta_cache = await self.fetcher.get_entities_meta_cache()
             detected_entities = expand_entities(meta_cache, detected_entities)
             if self.has_relations_search:
                 request.relation_subgraph.entry_points.extend(detected_entities)
                 request.relation_subgraph.depth = 1
-                request.relation_subgraph.deleted_groups.extend(await self._get_deleted_entity_groups())
+                request.relation_subgraph.deleted_groups.extend(
+                    await self.fetcher.get_deleted_entity_groups()
+                )
                 for group_id, deleted_entities in meta_cache.deleted_entities.items():
                     request.relation_subgraph.deleted_entities.append(
                         nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
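Note: when a caller passes `query_entities` (new in this release, presumably fed by the new `graph_strategy` module), the parser builds the relation entry points itself instead of awaiting Predict's entity detection. A standalone sketch of that mapping, with plain dicts standing in for `utils_pb2.RelationNode`:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class KGEntity:  # stand-in for nucliadb_models.search.KnowledgeGraphEntity
        name: str
        type: Optional[str] = None
        subtype: Optional[str] = None

    def to_entry_points(entities: list[KGEntity]) -> list[dict]:
        # Mirrors the loop above: only set the optional fields when present.
        nodes = []
        for entity in entities:
            node = {"value": entity.name}  # RelationNode.value
            if entity.type is not None:
                node["ntype"] = entity.type  # mapped via RelationNodeTypeMap in the real code
            if entity.subtype is not None:
                node["subtype"] = entity.subtype
            nodes.append(node)
        return nodes

    print(to_entry_points([KGEntity(name="Marie Curie", subtype="scientist")]))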
@@ -545,7 +434,7 @@ class QueryParser:
                 "Search with custom synonyms is only supported on paragraph and document search",
             )
 
-        synonyms = await self._get_synomyns()
+        synonyms = await self.fetcher.get_synonyms()
         if synonyms is None:
             # No synonyms found
             return
@@ -681,29 +570,6 @@ async def paragraph_query_to_pb(
     return request
 
 
-@query_parse_dependency_observer.wrap({"type": "query_information"})
-async def query_information(
-    kbid: str,
-    query: str,
-    semantic_model: Optional[str],
-    generative_model: Optional[str] = None,
-    rephrase: bool = False,
-    rephrase_prompt: Optional[str] = None,
-) -> QueryInfo:
-    predict = get_predict()
-    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
-
-
-@query_parse_dependency_observer.wrap({"type": "detect_entities"})
-async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]:
-    predict = get_predict()
-    try:
-        return await predict.detect_entities(kbid, query)
-    except SendToPredictError as ex:
-        logger.warning(f"Errors on predict api detecting entities: {ex}")
-        return []
-
-
 def expand_entities(
     meta_cache: datamanagers.entities.EntitiesMetaCache,
     detected_entities: list[utils_pb2.RelationNode],
@@ -834,30 +700,6 @@ PROCESSING_STATUS_TO_PB_MAP = {
 }
 
 
-@query_parse_dependency_observer.wrap({"type": "synonyms"})
-async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.synonyms.get(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
-async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
-async def get_deleted_entity_groups(kbid: str) -> list[str]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
-
-
-@query_parse_dependency_observer.wrap({"type": "classification_labels"})
-async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.labels.get_labels(txn, kbid=kbid)
-
-
 def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]):
     """
     Check if the provided filters are supported:
@@ -890,23 +732,10 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
     )
 
 
-@alru_cache(maxsize=None)
-async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    # This can be safely cached as the matryoshka dimension is not expected to change
-    return await get_matryoshka_dimension(kbid, vectorset)
-
-
-@query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
-async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    async with get_driver().transaction(read_only=True) as txn:
-        matryoshka_dimension = None
-        if not vectorset:
-            # XXX this should be migrated once we remove the "default" vectorset
-            # concept
-            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
-        else:
-            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
-            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
-                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
-
-        return matryoshka_dimension
+def get_sort_field_proto(obj: SortField) -> Optional[nodereader_pb2.OrderBy.OrderField.ValueType]:
+    return {
+        SortField.SCORE: None,
+        SortField.CREATED: nodereader_pb2.OrderBy.OrderField.CREATED,
+        SortField.MODIFIED: nodereader_pb2.OrderBy.OrderField.MODIFIED,
+        SortField.TITLE: None,
+    }[obj]
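Note: `get_sort_field_proto` replaces the imported `SortFieldMap` constant with a local function. Returning `None` for `SCORE` and `TITLE` means no explicit index-side order is set for those fields (score ordering is the index default; title sorting is, presumably, handled outside the index request). Illustrative usage:

    from nucliadb_models.search import SortField

    assert get_sort_field_proto(SortField.CREATED) == nodereader_pb2.OrderBy.OrderField.CREATED
    assert get_sort_field_proto(SortField.SCORE) is None  # no order field in the request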
nucliadb/search/search/query_parser/exceptions.py
@@ -19,4 +19,14 @@
 #
 
 
-class ParserError(ValueError): ...
+class InternalParserError(ValueError):
+    """Raised when parsing fails due to some internal error"""
+
+
+class InvalidQueryError(Exception):
+    """Raised when parsing a query containing an invalid parameter"""
+
+    def __init__(self, param: str, reason: str):
+        self.param = param
+        self.reason = reason
+        super().__init__(f"Invalid query. Error in {param}: {reason}")
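Note: the bare `ParserError` is split into two exceptions. `InvalidQueryError` keeps the offending parameter and the reason as attributes, so API layers can report them separately instead of parsing the message string. Illustrative use:

    try:
        raise InvalidQueryError("vectorset", "vectorset 'foo' doesn't exist in your Knowledge Box")
    except InvalidQueryError as exc:
        print(exc.param)   # "vectorset"
        print(exc.reason)  # "vectorset 'foo' doesn't exist in your Knowledge Box"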