nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,10 +18,12 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
+ from typing_extensions import assert_never
22
+
21
23
  from nucliadb.common import datamanagers
22
24
  from nucliadb.common.catalog.interface import CatalogExpression, CatalogQuery
23
25
  from nucliadb.common.exceptions import InvalidQueryError
24
- from nucliadb.common.filter_expression import FacetFilterTypes, facet_from_filter
26
+ from nucliadb.common.filter_expression import FacetFilter, facet_from_filter
25
27
  from nucliadb.search.search.filters import translate_label
26
28
  from nucliadb_models import search as search_models
27
29
  from nucliadb_models.filters import (
@@ -78,7 +80,6 @@ async def parse_catalog(kbid: str, item: search_models.CatalogRequest) -> Catalo
78
80
  sort = SortOptions(
79
81
  field=SortField.CREATED,
80
82
  order=SortOrder.DESC,
81
- limit=None,
82
83
  )
83
84
 
84
85
  if isinstance(item.query, search_models.CatalogQuery):
@@ -189,11 +190,9 @@ async def parse_filter_expression(expr: ResourceFilterExpression, kbid: str) ->
189
190
  cat.date = CatalogExpression.Date(field="created_at", since=expr.since, until=expr.until)
190
191
  elif isinstance(expr, DateModified):
191
192
  cat.date = CatalogExpression.Date(field="modified_at", since=expr.since, until=expr.until)
192
- elif isinstance(expr, FacetFilterTypes):
193
+ elif isinstance(expr, FacetFilter):
193
194
  cat.facet = facet_from_filter(expr)
194
195
  else:
195
- # This is a trick so mypy generates an error if this branch can be reached,
196
- # that is, if we are missing some ifs
197
- _a: int = "a"
196
+ assert_never(expr)
198
197
 
199
198
  return cat
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import re
21
21
  import string
22
- from typing import Optional, Union
23
22
 
24
23
  from nucliadb.search import logger
25
24
  from nucliadb.search.search.query_parser.fetcher import Fetcher
@@ -117,7 +116,7 @@ async def parse_keyword_query(
117
116
 
118
117
 
119
118
  async def parse_semantic_query(
120
- item: Union[search_models.SearchRequest, search_models.FindRequest],
119
+ item: search_models.SearchRequest | search_models.FindRequest,
121
120
  *,
122
121
  fetcher: Fetcher,
123
122
  ) -> SemanticQuery:
@@ -130,7 +129,7 @@ async def parse_semantic_query(
130
129
 
131
130
 
132
131
  def parse_keyword_min_score(
133
- min_score: Optional[Union[float, search_models.MinScore]],
132
+ min_score: float | search_models.MinScore | None,
134
133
  ) -> float:
135
134
  # Keep backward compatibility with the deprecated min_score payload
136
135
  # parameter being a float (specifying semantic)
@@ -141,7 +140,7 @@ def parse_keyword_min_score(
141
140
 
142
141
 
143
142
  async def parse_semantic_min_score(
144
- min_score: Optional[Union[float, search_models.MinScore]],
143
+ min_score: float | search_models.MinScore | None,
145
144
  *,
146
145
  fetcher: Fetcher,
147
146
  ):
@@ -170,7 +169,7 @@ async def query_with_synonyms(
170
169
  query: str,
171
170
  *,
172
171
  fetcher: Fetcher,
173
- ) -> Optional[str]:
172
+ ) -> str | None:
174
173
  """
175
174
  Replace the terms in the query with an expression that will make it match with the configured synonyms.
176
175
  We're using the Tantivy's query language here: https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html
@@ -192,7 +191,7 @@ async def query_with_synonyms(
192
191
  variants: dict[str, str] = {}
193
192
  for term, term_synonyms in synonyms.terms.items():
194
193
  if len(term_synonyms.synonyms) > 0:
195
- variants[term] = "({})".format(" OR ".join([term] + list(term_synonyms.synonyms)))
194
+ variants[term] = "({})".format(" OR ".join([term, *list(term_synonyms.synonyms)]))
196
195
 
197
196
  # Split the query into terms
198
197
  query_terms = query.split()
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional
22
21
 
23
22
  from nidx_protos import nodereader_pb2
24
23
  from pydantic import ValidationError
@@ -27,7 +26,6 @@ from nucliadb.common.exceptions import InvalidQueryError
27
26
  from nucliadb.common.filter_expression import parse_expression
28
27
  from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap
29
28
  from nucliadb.search.search.metrics import query_parser_observer
30
- from nucliadb.search.search.query import expand_entities
31
29
  from nucliadb.search.search.query_parser.exceptions import InternalParserError
32
30
  from nucliadb.search.search.query_parser.fetcher import Fetcher
33
31
  from nucliadb.search.search.query_parser.models import (
@@ -65,7 +63,7 @@ async def parse_find(
65
63
  kbid: str,
66
64
  item: FindRequest,
67
65
  *,
68
- fetcher: Optional[Fetcher] = None,
66
+ fetcher: Fetcher | None = None,
69
67
  ) -> ParsedQuery:
70
68
  fetcher = fetcher or fetcher_for_find(kbid, item)
71
69
  parser = _FindParser(kbid, item, fetcher)
@@ -93,8 +91,8 @@ class _FindParser:
93
91
  self.fetcher = fetcher
94
92
 
95
93
  # cached data while parsing
96
- self._query: Optional[Query] = None
97
- self._top_k: Optional[int] = None
94
+ self._query: Query | None = None
95
+ self._top_k: int | None = None
98
96
 
99
97
  async def parse(self) -> UnitRetrieval:
100
98
  self._validate_request()
@@ -122,11 +120,11 @@ class _FindParser:
122
120
  try:
123
121
  rank_fusion = self._parse_rank_fusion()
124
122
  except ValidationError as exc:
125
- raise InternalParserError(f"Parsing error in rank fusion: {str(exc)}") from exc
123
+ raise InternalParserError(f"Parsing error in rank fusion: {exc!s}") from exc
126
124
  try:
127
125
  reranker = self._parse_reranker()
128
126
  except ValidationError as exc:
129
- raise InternalParserError(f"Parsing error in reranker: {str(exc)}") from exc
127
+ raise InternalParserError(f"Parsing error in reranker: {exc!s}") from exc
130
128
 
131
129
  # Adjust retrieval windows. Our current implementation assume:
132
130
  # `top_k <= reranker.window <= rank_fusion.window`
@@ -170,15 +168,8 @@ class _FindParser:
170
168
  async def _parse_relation_query(self) -> RelationQuery:
171
169
  detected_entities = await self._get_detected_entities()
172
170
 
173
- deleted_entity_groups = await self.fetcher.get_deleted_entity_groups()
174
-
175
- meta_cache = await self.fetcher.get_entities_meta_cache()
176
- deleted_entities = meta_cache.deleted_entities
177
-
178
171
  return RelationQuery(
179
- entry_points=detected_entities,
180
- deleted_entity_groups=deleted_entity_groups,
181
- deleted_entities=deleted_entities,
172
+ entry_points=detected_entities, deleted_entity_groups=[], deleted_entities={}
182
173
  )
183
174
 
184
175
  async def _parse_graph_query(self) -> GraphQuery:
@@ -205,9 +196,6 @@ class _FindParser:
205
196
  else:
206
197
  detected_entities = await self.fetcher.get_detected_entities()
207
198
 
208
- meta_cache = await self.fetcher.get_entities_meta_cache()
209
- detected_entities = expand_entities(meta_cache, detected_entities)
210
-
211
199
  return detected_entities
212
200
 
213
201
  async def _parse_filters(self) -> Filters:
@@ -253,17 +241,9 @@ class _FindParser:
253
241
  else:
254
242
  filter_operator = nodereader_pb2.FilterOperator.AND
255
243
 
256
- autofilter = None
257
- if self.item.autofilter:
258
- if self._query.relation is not None:
259
- autofilter = self._query.relation.entry_points
260
- else:
261
- autofilter = await self._get_detected_entities()
262
-
263
244
  hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)
264
245
 
265
246
  return Filters(
266
- autofilter=autofilter,
267
247
  facets=[],
268
248
  field_expression=field_expr,
269
249
  paragraph_expression=paragraph_expr,
@@ -18,9 +18,9 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional, Union
22
21
 
23
22
  from nidx_protos import nodereader_pb2
23
+ from typing_extensions import assert_never
24
24
 
25
25
  from nucliadb.common.filter_expression import add_and_expression, parse_expression
26
26
  from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap, RelationTypeMap
@@ -56,11 +56,11 @@ async def parse_graph_relation_search(
56
56
  return pb
57
57
 
58
58
 
59
- AnyGraphRequest = Union[
60
- graph_requests.GraphSearchRequest,
61
- graph_requests.GraphNodesSearchRequest,
62
- graph_requests.GraphRelationsSearchRequest,
63
- ]
59
+ AnyGraphRequest = (
60
+ graph_requests.GraphSearchRequest
61
+ | graph_requests.GraphNodesSearchRequest
62
+ | graph_requests.GraphRelationsSearchRequest
63
+ )
64
64
 
65
65
 
66
66
  async def _parse_common(kbid: str, item: AnyGraphRequest) -> nodereader_pb2.GraphSearchRequest:
@@ -78,7 +78,7 @@ async def _parse_common(kbid: str, item: AnyGraphRequest) -> nodereader_pb2.Grap
78
78
  return pb
79
79
 
80
80
 
81
- async def _parse_filters(kbid: str, item: AnyGraphRequest) -> Optional[nodereader_pb2.FilterExpression]:
81
+ async def _parse_filters(kbid: str, item: AnyGraphRequest) -> nodereader_pb2.FilterExpression | None:
82
82
  filter_expr = nodereader_pb2.FilterExpression()
83
83
  if item.filter_expression:
84
84
  if item.filter_expression.field:
@@ -100,7 +100,7 @@ async def _parse_filters(kbid: str, item: AnyGraphRequest) -> Optional[nodereade
100
100
  return None
101
101
 
102
102
 
103
- def _parse_security(kbid: str, item: AnyGraphRequest) -> Optional[utils_pb2.Security]:
103
+ def _parse_security(kbid: str, item: AnyGraphRequest) -> utils_pb2.Security | None:
104
104
  if item.security is not None and len(item.security.groups) > 0:
105
105
  security_pb = utils_pb2.Security()
106
106
  for group_id in item.security.groups:
@@ -154,9 +154,7 @@ def parse_path_query(expr: graph_requests.GraphPathQuery) -> nodereader_pb2.Grap
154
154
  _set_generated_to_pb(expr, pb)
155
155
 
156
156
  else: # pragma: no cover
157
- # This is a trick so mypy generates an error if this branch can be reached,
158
- # that is, if we are missing some ifs
159
- _a: int = "a"
157
+ assert_never(expr)
160
158
 
161
159
  return pb
162
160
 
@@ -183,9 +181,7 @@ def _parse_node_query(expr: graph_requests.GraphNodesQuery) -> nodereader_pb2.Gr
183
181
  _set_generated_to_pb(expr, pb)
184
182
 
185
183
  else: # pragma: no cover
186
- # This is a trick so mypy generates an error if this branch can be reached,
187
- # that is, if we are missing some ifs
188
- _a: int = "a"
184
+ assert_never(expr)
189
185
 
190
186
  return pb
191
187
 
@@ -213,9 +209,7 @@ def _parse_relation_query(
213
209
  _set_generated_to_pb(expr, pb)
214
210
 
215
211
  else: # pragma: no cover
216
- # This is a trick so mypy generates an error if this branch can be reached,
217
- # that is, if we are missing some ifs
218
- _a: int = "a"
212
+ assert_never(expr)
219
213
 
220
214
  return pb
221
215
 
@@ -231,9 +225,7 @@ def _set_node_to_pb(node: graph_requests.GraphNode, pb: nodereader_pb2.GraphQuer
231
225
  pb.fuzzy.distance = 1
232
226
 
233
227
  else: # pragma: no cover
234
- # This is a trick so mypy generates an error if this branch can be reached,
235
- # that is, if we are missing some ifs
236
- _a: int = "a"
228
+ assert_never(node.match)
237
229
 
238
230
  if node.type is not None:
239
231
  pb.node_type = RelationNodeTypeMap[node.type]
@@ -264,6 +256,4 @@ def _set_generated_to_pb(generated: graph_requests.Generated, pb: nodereader_pb2
264
256
  pb.facet.facet = facet
265
257
 
266
258
  else: # pragma: no cover
267
- # This is a trick so mypy generates an error if this branch can be reached,
268
- # that is, if we are missing some ifs
269
- _a: int = "a"
259
+ assert_never(generated.by)
@@ -0,0 +1,207 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from nidx_protos import nodereader_pb2
21
+ from pydantic import ValidationError
22
+
23
+ from nucliadb.common.exceptions import InvalidQueryError
24
+ from nucliadb.common.filter_expression import parse_expression
25
+ from nucliadb.search.search.metrics import query_parser_observer
26
+ from nucliadb.search.search.query_parser.exceptions import InternalParserError
27
+ from nucliadb.search.search.query_parser.fetcher import Fetcher
28
+ from nucliadb.search.search.query_parser.models import (
29
+ Filters,
30
+ GraphQuery,
31
+ KeywordQuery,
32
+ Query,
33
+ RankFusion,
34
+ ReciprocalRankFusion,
35
+ SemanticQuery,
36
+ UnitRetrieval,
37
+ )
38
+ from nucliadb.search.search.query_parser.parsers.common import query_with_synonyms, validate_query_syntax
39
+ from nucliadb.search.search.utils import filter_hidden_resources
40
+ from nucliadb_models import search as search_models
41
+ from nucliadb_models.filters import FilterExpression
42
+ from nucliadb_models.retrieval import RetrievalRequest
43
+ from nucliadb_models.search import MAX_RANK_FUSION_WINDOW
44
+
45
+
46
+ @query_parser_observer.wrap({"type": "parse_retrieve"})
47
+ async def parse_retrieve(kbid: str, item: RetrievalRequest) -> UnitRetrieval:
48
+ fetcher = Fetcher(
49
+ kbid=kbid,
50
+ query=item.query.keyword.query if item.query.keyword else "",
51
+ user_vector=item.query.semantic.query if item.query.semantic else None,
52
+ vectorset=item.query.semantic.vectorset if item.query.semantic else None,
53
+ # Retrieve doesn't use images for now
54
+ query_image=None,
55
+ # Retrieve doesn't do rephrasing
56
+ rephrase=False,
57
+ rephrase_prompt=None,
58
+ generative_model=None,
59
+ )
60
+ parser = _RetrievalParser(kbid, item, fetcher)
61
+ retrieval = await parser.parse()
62
+ return retrieval
63
+
64
+
65
+ class _RetrievalParser:
66
+ def __init__(self, kbid: str, item: RetrievalRequest, fetcher: Fetcher):
67
+ self.kbid = kbid
68
+ self.item = item
69
+ self.fetcher = fetcher
70
+
71
+ async def parse(self) -> UnitRetrieval:
72
+ top_k = self.item.top_k
73
+ query = await self._parse_query()
74
+ filters = await self._parse_filters()
75
+ try:
76
+ rank_fusion = self._parse_rank_fusion()
77
+ except ValidationError as exc:
78
+ raise InternalParserError(f"Parsing error in rank fusion: {exc!s}") from exc
79
+
80
+ # ensure top_k and rank_fusion are coherent
81
+ if top_k > rank_fusion.window:
82
+ raise InvalidQueryError(
83
+ "rank_fusion.window", "Rank fusion window must be greater or equal to top_k"
84
+ )
85
+
86
+ retrieval = UnitRetrieval(
87
+ query=query,
88
+ top_k=top_k,
89
+ filters=filters,
90
+ rank_fusion=rank_fusion,
91
+ reranker=None,
92
+ )
93
+ return retrieval
94
+
95
+ async def _parse_query(self) -> Query:
96
+ keyword = None
97
+ if self.item.query.keyword is not None:
98
+ keyword_query, is_synonyms_query = await self._parse_keyword_query()
99
+ keyword = KeywordQuery(
100
+ query=keyword_query,
101
+ is_synonyms_query=is_synonyms_query,
102
+ min_score=self.item.query.keyword.min_score,
103
+ )
104
+
105
+ semantic = None
106
+ if self.item.query.semantic is not None:
107
+ vectorset, query_vector = await self._parse_semantic_query()
108
+ semantic = SemanticQuery(
109
+ query=query_vector,
110
+ vectorset=vectorset,
111
+ min_score=self.item.query.semantic.min_score,
112
+ )
113
+
114
+ graph = None
115
+ if self.item.query.graph is not None:
116
+ graph = GraphQuery(query=self.item.query.graph.query)
117
+
118
+ return Query(keyword=keyword, semantic=semantic, graph=graph)
119
+
120
+ async def _parse_keyword_query(self) -> tuple[str, bool]:
121
+ assert self.item.query.keyword is not None
122
+ keyword_query = self.item.query.keyword.query
123
+ is_synonyms_query = False
124
+ if self.item.query.keyword.with_synonyms:
125
+ synonyms_query = await query_with_synonyms(keyword_query, fetcher=self.fetcher)
126
+ if synonyms_query is not None:
127
+ keyword_query = synonyms_query
128
+ is_synonyms_query = True
129
+
130
+ # after all query transformations, pass a validator that can fix some
131
+ # queries that trigger a panic on the index
132
+ keyword_query = validate_query_syntax(keyword_query)
133
+ return keyword_query, is_synonyms_query
134
+
135
+ async def _parse_semantic_query(self) -> tuple[str, list[float]]:
136
+ # Make sure the vectorset exists in the KB
137
+ assert self.item.query.semantic is not None
138
+ vectorset = self.item.query.semantic.vectorset
139
+ await self.fetcher.validate_vectorset(self.kbid, vectorset)
140
+
141
+ # Calculate the matryoshka dimension if applicable
142
+ user_vector = self.item.query.semantic.query
143
+ matryoshka_dimension = await self.fetcher.get_matryoshka_dimension_cached(self.kbid, vectorset)
144
+ if matryoshka_dimension is not None:
145
+ if len(user_vector) < matryoshka_dimension:
146
+ raise InvalidQueryError(
147
+ "vector",
148
+ f"Invalid vector length, please check valid embedding size for {vectorset} model",
149
+ )
150
+
151
+ # KB using a matryoshka embeddings model, cut the query vector
152
+ # accordingly
153
+ query_vector = user_vector[:matryoshka_dimension]
154
+ return vectorset, query_vector
155
+
156
+ async def _parse_filters(self) -> Filters:
157
+ filters = Filters()
158
+ if self.item.filters is None:
159
+ return filters
160
+
161
+ if self.item.filters.filter_expression is not None:
162
+ if self.item.filters.filter_expression.field is not None:
163
+ filters.field_expression = await parse_expression(
164
+ self.item.filters.filter_expression.field,
165
+ self.kbid,
166
+ )
167
+ if self.item.filters.filter_expression.paragraph is not None:
168
+ filters.paragraph_expression = await parse_expression(
169
+ self.item.filters.filter_expression.paragraph,
170
+ self.kbid,
171
+ )
172
+ if self.item.filters.filter_expression.operator == FilterExpression.Operator.OR:
173
+ filter_operator = nodereader_pb2.FilterOperator.OR
174
+ else:
175
+ filter_operator = nodereader_pb2.FilterOperator.AND
176
+ filters.filter_expression_operator = filter_operator
177
+
178
+ filters.hidden = await filter_hidden_resources(self.kbid, self.item.filters.show_hidden)
179
+ filters.security = self.item.filters.security
180
+ filters.with_duplicates = self.item.filters.with_duplicates
181
+
182
+ return filters
183
+
184
+ def _parse_rank_fusion(self) -> RankFusion:
185
+ rank_fusion: RankFusion
186
+
187
+ top_k = self.item.top_k
188
+ window = min(top_k, MAX_RANK_FUSION_WINDOW)
189
+
190
+ if isinstance(self.item.rank_fusion, search_models.RankFusionName):
191
+ if self.item.rank_fusion == search_models.RankFusionName.RECIPROCAL_RANK_FUSION:
192
+ rank_fusion = ReciprocalRankFusion(window=window)
193
+ else:
194
+ raise InternalParserError(f"Unknown rank fusion algorithm: {self.item.rank_fusion}")
195
+
196
+ elif isinstance(self.item.rank_fusion, search_models.ReciprocalRankFusion):
197
+ user_window = self.item.rank_fusion.window
198
+ rank_fusion = ReciprocalRankFusion(
199
+ k=self.item.rank_fusion.k,
200
+ boosting=self.item.rank_fusion.boosting,
201
+ window=min(max(user_window or 0, top_k), 500),
202
+ )
203
+
204
+ else:
205
+ raise InternalParserError(f"Unknown rank fusion {self.item.rank_fusion}")
206
+
207
+ return rank_fusion
@@ -17,14 +17,12 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  from nidx_protos import nodereader_pb2
23
22
 
24
23
  from nucliadb.common.exceptions import InvalidQueryError
25
24
  from nucliadb.common.filter_expression import parse_expression
26
25
  from nucliadb.search.search.metrics import query_parser_observer
27
- from nucliadb.search.search.query import expand_entities
28
26
  from nucliadb.search.search.query_parser.fetcher import Fetcher
29
27
  from nucliadb.search.search.query_parser.models import (
30
28
  Filters,
@@ -60,9 +58,7 @@ INDEX_SORTABLE_FIELDS = [
60
58
 
61
59
 
62
60
  @query_parser_observer.wrap({"type": "parse_search"})
63
- async def parse_search(
64
- kbid: str, item: SearchRequest, *, fetcher: Optional[Fetcher] = None
65
- ) -> ParsedQuery:
61
+ async def parse_search(kbid: str, item: SearchRequest, *, fetcher: Fetcher | None = None) -> ParsedQuery:
66
62
  fetcher = fetcher or fetcher_for_search(kbid, item)
67
63
  parser = _SearchParser(kbid, item, fetcher)
68
64
  retrieval = await parser.parse()
@@ -89,14 +85,17 @@ class _SearchParser:
89
85
  self.fetcher = fetcher
90
86
 
91
87
  # cached data while parsing
92
- self._query: Optional[Query] = None
93
- self._top_k: Optional[int] = None
88
+ self._query: Query | None = None
89
+ self._top_k: int | None = None
94
90
 
95
91
  async def parse(self) -> UnitRetrieval:
96
92
  self._validate_request()
97
93
 
98
94
  self._top_k = parse_top_k(self.item)
99
95
 
96
+ if self._top_k > 0 and self.item.offset > 0:
97
+ self._top_k += self.item.offset
98
+
100
99
  # parse search types (features)
101
100
 
102
101
  self._query = Query()
@@ -149,67 +148,38 @@ class _SearchParser:
149
148
  assert self._top_k is not None, "top_k must be parsed before text query"
150
149
 
151
150
  keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)
152
- sort, order_by, limit = self._parse_sorting()
151
+ sort, order_by = self._parse_sorting()
153
152
  keyword.sort = sort
154
153
  keyword.order_by = order_by
155
- if limit is not None:
156
- # sort limit can extend top_k
157
- self._top_k = max(self._top_k, limit)
154
+
158
155
  return keyword
159
156
 
160
157
async def _parse_relation_query(self) -> RelationQuery:
    """Build the relation query for this request.

    The detected entities are used as entry points. Deleted entity groups
    and deleted entities are always passed empty.
    """
    entry_points = await self._get_detected_entities()
    relation_query = RelationQuery(
        entry_points=entry_points,
        deleted_entity_groups=[],
        deleted_entities={},
    )
    return relation_query
170
163
 
171
164
async def _get_detected_entities(self) -> list[utils_pb2.RelationNode]:
    """Return the detected entities exactly as provided by the fetcher."""
    return await self.fetcher.get_detected_entities()
176
167
 
177
def _parse_sorting(self) -> tuple[search_models.SortOrder, search_models.SortField]:
    """Resolve the sort order and sort field for the keyword query.

    An explicit ``sort`` on the request is used as-is. Without one, an
    empty query falls back to creation date and a non-empty query to
    score, both in descending order.

    Returns:
        A ``(order, field)`` tuple.
    """
    requested = self.item.sort
    if requested is not None:
        return (requested.order, requested.field)

    # No explicit sort: choose the default field from whether a query was given.
    default_field = SortField.CREATED if len(self.item.query) == 0 else SortField.SCORE
    fallback = SortOptions(
        field=default_field,
        order=SortOrder.DESC,
    )
    return (fallback.order, fallback.field)
213
183
 
214
184
  async def _parse_filters(self) -> Filters:
215
185
  assert self._query is not None, "query must be parsed before filters"
@@ -251,17 +221,9 @@ class _SearchParser:
251
221
  else:
252
222
  filter_operator = nodereader_pb2.FilterOperator.AND
253
223
 
254
- autofilter = None
255
- if self.item.autofilter:
256
- if self._query.relation is not None:
257
- autofilter = self._query.relation.entry_points
258
- else:
259
- autofilter = await self._get_detected_entities()
260
-
261
224
  hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)
262
225
 
263
226
  return Filters(
264
- autofilter=autofilter,
265
227
  facets=self.item.faceted,
266
228
  field_expression=field_expr,
267
229
  paragraph_expression=paragraph_expr,