nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,8 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
20
+
21
+ from typing_extensions import assert_never
21
22
 
22
23
  from nucliadb.search.search.query_parser.fetcher import Fetcher
23
24
  from nucliadb.search.search.query_parser.models import (
@@ -26,7 +27,7 @@ from nucliadb.search.search.query_parser.models import (
26
27
  from nucliadb_models.search import AskRequest, MaxTokens
27
28
 
28
29
 
29
- async def parse_ask(kbid: str, item: AskRequest, *, fetcher: Optional[Fetcher] = None) -> Generation:
30
+ async def parse_ask(kbid: str, item: AskRequest, *, fetcher: Fetcher | None = None) -> Generation:
30
31
  fetcher = fetcher or fetcher_for_ask(kbid, item)
31
32
  parser = _AskParser(kbid, item, fetcher)
32
33
  return await parser.parse()
@@ -63,10 +64,8 @@ class _AskParser:
63
64
  )
64
65
  elif isinstance(self.item.max_tokens, MaxTokens):
65
66
  max_tokens = self.item.max_tokens
66
- else: # pragma: nocover
67
- # This is a trick so mypy generates an error if this branch can be reached,
68
- # that is, if we are missing some ifs
69
- _a: int = "a"
67
+ else: # pragma: no cover
68
+ assert_never(self.item.max_tokens)
70
69
 
71
70
  max_context_tokens = await self.fetcher.get_max_context_tokens(max_tokens)
72
71
  max_answer_tokens = self.fetcher.get_max_answer_tokens(max_tokens)
@@ -18,14 +18,13 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
+ from typing_extensions import assert_never
22
+
21
23
  from nucliadb.common import datamanagers
24
+ from nucliadb.common.catalog.interface import CatalogExpression, CatalogQuery
22
25
  from nucliadb.common.exceptions import InvalidQueryError
23
- from nucliadb.common.filter_expression import FacetFilterTypes, facet_from_filter
26
+ from nucliadb.common.filter_expression import FacetFilter, facet_from_filter
24
27
  from nucliadb.search.search.filters import translate_label
25
- from nucliadb.search.search.query_parser.models import (
26
- CatalogExpression,
27
- CatalogQuery,
28
- )
29
28
  from nucliadb_models import search as search_models
30
29
  from nucliadb_models.filters import (
31
30
  And,
@@ -81,7 +80,6 @@ async def parse_catalog(kbid: str, item: search_models.CatalogRequest) -> Catalo
81
80
  sort = SortOptions(
82
81
  field=SortField.CREATED,
83
82
  order=SortOrder.DESC,
84
- limit=None,
85
83
  )
86
84
 
87
85
  if isinstance(item.query, search_models.CatalogQuery):
@@ -185,18 +183,16 @@ async def parse_filter_expression(expr: ResourceFilterExpression, kbid: str) ->
185
183
  if rid is None:
186
184
  raise InvalidQueryError("slug", f"Cannot find slug {expr.slug}")
187
185
  cat.resource_id = rid
188
- else: # pragma: nocover
186
+ else: # pragma: no cover
189
187
  # Cannot happen due to model validation
190
188
  raise ValueError("Resource needs id or slug")
191
189
  elif isinstance(expr, DateCreated):
192
190
  cat.date = CatalogExpression.Date(field="created_at", since=expr.since, until=expr.until)
193
191
  elif isinstance(expr, DateModified):
194
192
  cat.date = CatalogExpression.Date(field="modified_at", since=expr.since, until=expr.until)
195
- elif isinstance(expr, FacetFilterTypes):
193
+ elif isinstance(expr, FacetFilter):
196
194
  cat.facet = facet_from_filter(expr)
197
195
  else:
198
- # This is a trick so mypy generates an error if this branch can be reached,
199
- # that is, if we are missing some ifs
200
- _a: int = "a"
196
+ assert_never(expr)
201
197
 
202
198
  return cat
@@ -19,9 +19,7 @@
19
19
  #
20
20
  import re
21
21
  import string
22
- from typing import Optional, Union
23
22
 
24
- from nucliadb.common.exceptions import InvalidQueryError
25
23
  from nucliadb.search import logger
26
24
  from nucliadb.search.search.query_parser.fetcher import Fetcher
27
25
  from nucliadb.search.search.query_parser.models import (
@@ -32,15 +30,20 @@ from nucliadb_models import search as search_models
32
30
 
33
31
  DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
34
32
 
35
- # -* is an invalid query in tantivy and it won't return results but if you add some whitespaces
36
- # between - and *, it will actually trigger a tantivy bug and panic
37
- INVALID_QUERY = re.compile(r"- +\*")
38
33
 
34
+ def validate_query_syntax(query: str) -> str:
35
+ """Filter some queries that panic tantivy, better than returning the 500"""
39
36
 
40
- def validate_query_syntax(query: str):
41
- # Filter some queries that panic tantivy, better than returning the 500
37
+ # -* is an invalid query in tantivy and it won't return results but if you add some whitespaces
38
+ # between - and *, it will actually trigger a tantivy bug and panic
39
+ INVALID_QUERY = re.compile(r"- *\*+")
42
40
  if INVALID_QUERY.search(query):
43
- raise InvalidQueryError("query", "Invalid query syntax")
41
+ # remove the * and extra spaces, as it's probably what doesn't have
42
+ # meaning in both cases: -* and - *
43
+ fixed = re.sub(INVALID_QUERY, "- ", query)
44
+ query = fixed
45
+
46
+ return query
44
47
 
45
48
 
46
49
  def is_empty_query(request: search_models.BaseSearchRequest) -> bool:
@@ -85,6 +88,7 @@ async def parse_keyword_query(
85
88
  fetcher: Fetcher,
86
89
  ) -> KeywordQuery:
87
90
  query = item.query
91
+
88
92
  # If there was a rephrase with image, we should use the rephrased query for keyword search
89
93
  rephrased_query = await fetcher.get_rephrased_query()
90
94
  if item.query_image is not None and rephrased_query is not None:
@@ -98,6 +102,10 @@ async def parse_keyword_query(
98
102
  query = synonyms_query
99
103
  is_synonyms_query = True
100
104
 
105
+ # after all query transformations, pass a validator that can fix some
106
+ # queries that trigger a panic on the index
107
+ query = validate_query_syntax(query)
108
+
101
109
  min_score = parse_keyword_min_score(item.min_score)
102
110
 
103
111
  return KeywordQuery(
@@ -108,7 +116,7 @@ async def parse_keyword_query(
108
116
 
109
117
 
110
118
  async def parse_semantic_query(
111
- item: Union[search_models.SearchRequest, search_models.FindRequest],
119
+ item: search_models.SearchRequest | search_models.FindRequest,
112
120
  *,
113
121
  fetcher: Fetcher,
114
122
  ) -> SemanticQuery:
@@ -121,7 +129,7 @@ async def parse_semantic_query(
121
129
 
122
130
 
123
131
  def parse_keyword_min_score(
124
- min_score: Optional[Union[float, search_models.MinScore]],
132
+ min_score: float | search_models.MinScore | None,
125
133
  ) -> float:
126
134
  # Keep backward compatibility with the deprecated min_score payload
127
135
  # parameter being a float (specifying semantic)
@@ -132,7 +140,7 @@ def parse_keyword_min_score(
132
140
 
133
141
 
134
142
  async def parse_semantic_min_score(
135
- min_score: Optional[Union[float, search_models.MinScore]],
143
+ min_score: float | search_models.MinScore | None,
136
144
  *,
137
145
  fetcher: Fetcher,
138
146
  ):
@@ -161,7 +169,7 @@ async def query_with_synonyms(
161
169
  query: str,
162
170
  *,
163
171
  fetcher: Fetcher,
164
- ) -> Optional[str]:
172
+ ) -> str | None:
165
173
  """
166
174
  Replace the terms in the query with an expression that will make it match with the configured synonyms.
167
175
  We're using the Tantivy's query language here: https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html
@@ -183,7 +191,7 @@ async def query_with_synonyms(
183
191
  variants: dict[str, str] = {}
184
192
  for term, term_synonyms in synonyms.terms.items():
185
193
  if len(term_synonyms.synonyms) > 0:
186
- variants[term] = "({})".format(" OR ".join([term] + list(term_synonyms.synonyms)))
194
+ variants[term] = "({})".format(" OR ".join([term, *list(term_synonyms.synonyms)]))
187
195
 
188
196
  # Split the query into terms
189
197
  query_terms = query.split()
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional
22
21
 
23
22
  from nidx_protos import nodereader_pb2
24
23
  from pydantic import ValidationError
@@ -27,7 +26,6 @@ from nucliadb.common.exceptions import InvalidQueryError
27
26
  from nucliadb.common.filter_expression import parse_expression
28
27
  from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap
29
28
  from nucliadb.search.search.metrics import query_parser_observer
30
- from nucliadb.search.search.query import expand_entities
31
29
  from nucliadb.search.search.query_parser.exceptions import InternalParserError
32
30
  from nucliadb.search.search.query_parser.fetcher import Fetcher
33
31
  from nucliadb.search.search.query_parser.models import (
@@ -57,7 +55,6 @@ from .common import (
57
55
  parse_semantic_query,
58
56
  parse_top_k,
59
57
  should_disable_vector_search,
60
- validate_query_syntax,
61
58
  )
62
59
 
63
60
 
@@ -66,7 +63,7 @@ async def parse_find(
66
63
  kbid: str,
67
64
  item: FindRequest,
68
65
  *,
69
- fetcher: Optional[Fetcher] = None,
66
+ fetcher: Fetcher | None = None,
70
67
  ) -> ParsedQuery:
71
68
  fetcher = fetcher or fetcher_for_find(kbid, item)
72
69
  parser = _FindParser(kbid, item, fetcher)
@@ -94,8 +91,8 @@ class _FindParser:
94
91
  self.fetcher = fetcher
95
92
 
96
93
  # cached data while parsing
97
- self._query: Optional[Query] = None
98
- self._top_k: Optional[int] = None
94
+ self._query: Query | None = None
95
+ self._top_k: int | None = None
99
96
 
100
97
  async def parse(self) -> UnitRetrieval:
101
98
  self._validate_request()
@@ -123,11 +120,11 @@ class _FindParser:
123
120
  try:
124
121
  rank_fusion = self._parse_rank_fusion()
125
122
  except ValidationError as exc:
126
- raise InternalParserError(f"Parsing error in rank fusion: {str(exc)}") from exc
123
+ raise InternalParserError(f"Parsing error in rank fusion: {exc!s}") from exc
127
124
  try:
128
125
  reranker = self._parse_reranker()
129
126
  except ValidationError as exc:
130
- raise InternalParserError(f"Parsing error in reranker: {str(exc)}") from exc
127
+ raise InternalParserError(f"Parsing error in reranker: {exc!s}") from exc
131
128
 
132
129
  # Adjust retrieval windows. Our current implementation assume:
133
130
  # `top_k <= reranker.window <= rank_fusion.window`
@@ -146,8 +143,6 @@ class _FindParser:
146
143
  return retrieval
147
144
 
148
145
  def _validate_request(self):
149
- validate_query_syntax(self.item.query)
150
-
151
146
  # synonyms are not compatible with vector/graph search
152
147
  if (
153
148
  self.item.with_synonyms
@@ -173,15 +168,8 @@ class _FindParser:
173
168
  async def _parse_relation_query(self) -> RelationQuery:
174
169
  detected_entities = await self._get_detected_entities()
175
170
 
176
- deleted_entity_groups = await self.fetcher.get_deleted_entity_groups()
177
-
178
- meta_cache = await self.fetcher.get_entities_meta_cache()
179
- deleted_entities = meta_cache.deleted_entities
180
-
181
171
  return RelationQuery(
182
- entry_points=detected_entities,
183
- deleted_entity_groups=deleted_entity_groups,
184
- deleted_entities=deleted_entities,
172
+ entry_points=detected_entities, deleted_entity_groups=[], deleted_entities={}
185
173
  )
186
174
 
187
175
  async def _parse_graph_query(self) -> GraphQuery:
@@ -208,9 +196,6 @@ class _FindParser:
208
196
  else:
209
197
  detected_entities = await self.fetcher.get_detected_entities()
210
198
 
211
- meta_cache = await self.fetcher.get_entities_meta_cache()
212
- detected_entities = expand_entities(meta_cache, detected_entities)
213
-
214
199
  return detected_entities
215
200
 
216
201
  async def _parse_filters(self) -> Filters:
@@ -256,17 +241,9 @@ class _FindParser:
256
241
  else:
257
242
  filter_operator = nodereader_pb2.FilterOperator.AND
258
243
 
259
- autofilter = None
260
- if self.item.autofilter:
261
- if self._query.relation is not None:
262
- autofilter = self._query.relation.entry_points
263
- else:
264
- autofilter = await self._get_detected_entities()
265
-
266
244
  hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)
267
245
 
268
246
  return Filters(
269
- autofilter=autofilter,
270
247
  facets=[],
271
248
  field_expression=field_expr,
272
249
  paragraph_expression=paragraph_expr,
@@ -18,9 +18,9 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional, Union
22
21
 
23
22
  from nidx_protos import nodereader_pb2
23
+ from typing_extensions import assert_never
24
24
 
25
25
  from nucliadb.common.filter_expression import add_and_expression, parse_expression
26
26
  from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap, RelationTypeMap
@@ -56,11 +56,11 @@ async def parse_graph_relation_search(
56
56
  return pb
57
57
 
58
58
 
59
- AnyGraphRequest = Union[
60
- graph_requests.GraphSearchRequest,
61
- graph_requests.GraphNodesSearchRequest,
62
- graph_requests.GraphRelationsSearchRequest,
63
- ]
59
+ AnyGraphRequest = (
60
+ graph_requests.GraphSearchRequest
61
+ | graph_requests.GraphNodesSearchRequest
62
+ | graph_requests.GraphRelationsSearchRequest
63
+ )
64
64
 
65
65
 
66
66
  async def _parse_common(kbid: str, item: AnyGraphRequest) -> nodereader_pb2.GraphSearchRequest:
@@ -78,7 +78,7 @@ async def _parse_common(kbid: str, item: AnyGraphRequest) -> nodereader_pb2.Grap
78
78
  return pb
79
79
 
80
80
 
81
- async def _parse_filters(kbid: str, item: AnyGraphRequest) -> Optional[nodereader_pb2.FilterExpression]:
81
+ async def _parse_filters(kbid: str, item: AnyGraphRequest) -> nodereader_pb2.FilterExpression | None:
82
82
  filter_expr = nodereader_pb2.FilterExpression()
83
83
  if item.filter_expression:
84
84
  if item.filter_expression.field:
@@ -100,7 +100,7 @@ async def _parse_filters(kbid: str, item: AnyGraphRequest) -> Optional[nodereade
100
100
  return None
101
101
 
102
102
 
103
- def _parse_security(kbid: str, item: AnyGraphRequest) -> Optional[utils_pb2.Security]:
103
+ def _parse_security(kbid: str, item: AnyGraphRequest) -> utils_pb2.Security | None:
104
104
  if item.security is not None and len(item.security.groups) > 0:
105
105
  security_pb = utils_pb2.Security()
106
106
  for group_id in item.security.groups:
@@ -153,10 +153,8 @@ def parse_path_query(expr: graph_requests.GraphPathQuery) -> nodereader_pb2.Grap
153
153
  elif isinstance(expr, graph_requests.Generated):
154
154
  _set_generated_to_pb(expr, pb)
155
155
 
156
- else: # pragma: nocover
157
- # This is a trick so mypy generates an error if this branch can be reached,
158
- # that is, if we are missing some ifs
159
- _a: int = "a"
156
+ else: # pragma: no cover
157
+ assert_never(expr)
160
158
 
161
159
  return pb
162
160
 
@@ -182,10 +180,8 @@ def _parse_node_query(expr: graph_requests.GraphNodesQuery) -> nodereader_pb2.Gr
182
180
  elif isinstance(expr, graph_requests.Generated):
183
181
  _set_generated_to_pb(expr, pb)
184
182
 
185
- else: # pragma: nocover
186
- # This is a trick so mypy generates an error if this branch can be reached,
187
- # that is, if we are missing some ifs
188
- _a: int = "a"
183
+ else: # pragma: no cover
184
+ assert_never(expr)
189
185
 
190
186
  return pb
191
187
 
@@ -212,10 +208,8 @@ def _parse_relation_query(
212
208
  elif isinstance(expr, graph_requests.Generated):
213
209
  _set_generated_to_pb(expr, pb)
214
210
 
215
- else: # pragma: nocover
216
- # This is a trick so mypy generates an error if this branch can be reached,
217
- # that is, if we are missing some ifs
218
- _a: int = "a"
211
+ else: # pragma: no cover
212
+ assert_never(expr)
219
213
 
220
214
  return pb
221
215
 
@@ -230,10 +224,8 @@ def _set_node_to_pb(node: graph_requests.GraphNode, pb: nodereader_pb2.GraphQuer
230
224
  pb.fuzzy.kind = nodereader_pb2.GraphQuery.Node.MatchLocation.PREFIX
231
225
  pb.fuzzy.distance = 1
232
226
 
233
- else: # pragma: nocover
234
- # This is a trick so mypy generates an error if this branch can be reached,
235
- # that is, if we are missing some ifs
236
- _a: int = "a"
227
+ else: # pragma: no cover
228
+ assert_never(node.match)
237
229
 
238
230
  if node.type is not None:
239
231
  pb.node_type = RelationNodeTypeMap[node.type]
@@ -263,7 +255,5 @@ def _set_generated_to_pb(generated: graph_requests.Generated, pb: nodereader_pb2
263
255
 
264
256
  pb.facet.facet = facet
265
257
 
266
- else: # pragma: nocover
267
- # This is a trick so mypy generates an error if this branch can be reached,
268
- # that is, if we are missing some ifs
269
- _a: int = "a"
258
+ else: # pragma: no cover
259
+ assert_never(generated.by)
@@ -0,0 +1,207 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from nidx_protos import nodereader_pb2
21
+ from pydantic import ValidationError
22
+
23
+ from nucliadb.common.exceptions import InvalidQueryError
24
+ from nucliadb.common.filter_expression import parse_expression
25
+ from nucliadb.search.search.metrics import query_parser_observer
26
+ from nucliadb.search.search.query_parser.exceptions import InternalParserError
27
+ from nucliadb.search.search.query_parser.fetcher import Fetcher
28
+ from nucliadb.search.search.query_parser.models import (
29
+ Filters,
30
+ GraphQuery,
31
+ KeywordQuery,
32
+ Query,
33
+ RankFusion,
34
+ ReciprocalRankFusion,
35
+ SemanticQuery,
36
+ UnitRetrieval,
37
+ )
38
+ from nucliadb.search.search.query_parser.parsers.common import query_with_synonyms, validate_query_syntax
39
+ from nucliadb.search.search.utils import filter_hidden_resources
40
+ from nucliadb_models import search as search_models
41
+ from nucliadb_models.filters import FilterExpression
42
+ from nucliadb_models.retrieval import RetrievalRequest
43
+ from nucliadb_models.search import MAX_RANK_FUSION_WINDOW
44
+
45
+
46
@query_parser_observer.wrap({"type": "parse_retrieve"})
async def parse_retrieve(kbid: str, item: RetrievalRequest) -> UnitRetrieval:
    """Parse a retrieval request for knowledge box ``kbid`` into a ``UnitRetrieval``.

    A ``Fetcher`` is created from the keyword and semantic parts of the
    request (when present) and handed to ``_RetrievalParser``, which does the
    actual parsing.
    """
    keyword_part = item.query.keyword
    semantic_part = item.query.semantic

    # Without a keyword query the fetcher receives an empty query string.
    if keyword_part:
        query_text = keyword_part.query
    else:
        query_text = ""

    # Without a semantic query there is neither a user vector nor a vectorset.
    if semantic_part:
        user_vector = semantic_part.query
        vectorset = semantic_part.vectorset
    else:
        user_vector = None
        vectorset = None

    fetcher = Fetcher(
        kbid=kbid,
        query=query_text,
        user_vector=user_vector,
        vectorset=vectorset,
        # Retrieve doesn't use images for now
        query_image=None,
        # Retrieve doesn't do rephrasing
        rephrase=False,
        rephrase_prompt=None,
        generative_model=None,
    )
    return await _RetrievalParser(kbid, item, fetcher).parse()
63
+
64
+
65
class _RetrievalParser:
    """Parse a ``RetrievalRequest`` for one knowledge box into a ``UnitRetrieval``.

    The parser reads the request (``item``) and uses ``fetcher`` for the
    lookups it needs (synonyms, vectorset validation and matryoshka
    dimension).
    """

    def __init__(self, kbid: str, item: RetrievalRequest, fetcher: Fetcher):
        self.kbid = kbid
        self.item = item
        self.fetcher = fetcher

    async def parse(self) -> UnitRetrieval:
        """Parse the whole request.

        Raises:
            InternalParserError: if building the rank fusion model fails
                validation, or the rank fusion option is not recognised.
            InvalidQueryError: if ``top_k`` is larger than the resulting rank
                fusion window (or any sub-parser rejects the query).
        """
        top_k = self.item.top_k
        query = await self._parse_query()
        filters = await self._parse_filters()
        try:
            rank_fusion = self._parse_rank_fusion()
        except ValidationError as exc:
            raise InternalParserError(f"Parsing error in rank fusion: {exc!s}") from exc

        # ensure top_k and rank_fusion are coherent
        if top_k > rank_fusion.window:
            raise InvalidQueryError(
                "rank_fusion.window", "Rank fusion window must be greater or equal to top_k"
            )

        return UnitRetrieval(
            query=query,
            top_k=top_k,
            filters=filters,
            rank_fusion=rank_fusion,
            reranker=None,
        )

    async def _parse_query(self) -> Query:
        """Build the ``Query`` from the keyword, semantic and graph parts that are set."""
        keyword = None
        if self.item.query.keyword is not None:
            keyword_query, is_synonyms_query = await self._parse_keyword_query()
            keyword = KeywordQuery(
                query=keyword_query,
                is_synonyms_query=is_synonyms_query,
                min_score=self.item.query.keyword.min_score,
            )

        semantic = None
        if self.item.query.semantic is not None:
            vectorset, query_vector = await self._parse_semantic_query()
            semantic = SemanticQuery(
                query=query_vector,
                vectorset=vectorset,
                min_score=self.item.query.semantic.min_score,
            )

        graph = None
        if self.item.query.graph is not None:
            graph = GraphQuery(query=self.item.query.graph.query)

        return Query(keyword=keyword, semantic=semantic, graph=graph)

    async def _parse_keyword_query(self) -> tuple[str, bool]:
        """Return the keyword query text and whether it was replaced by a synonyms query.

        Must only be called when the request carries a keyword query.
        """
        # Internal invariant (also narrows the optional type for type checkers).
        assert self.item.query.keyword is not None
        keyword_query = self.item.query.keyword.query
        is_synonyms_query = False
        if self.item.query.keyword.with_synonyms:
            synonyms_query = await query_with_synonyms(keyword_query, fetcher=self.fetcher)
            if synonyms_query is not None:
                keyword_query = synonyms_query
                is_synonyms_query = True

        # after all query transformations, pass a validator that can fix some
        # queries that trigger a panic on the index
        keyword_query = validate_query_syntax(keyword_query)
        return keyword_query, is_synonyms_query

    async def _parse_semantic_query(self) -> tuple[str, list[float]]:
        """Return the vectorset and the query vector to search with.

        The user vector is used as-is unless the fetcher reports a matryoshka
        dimension for the vectorset, in which case the vector is cut to that
        dimension. Must only be called when the request carries a semantic
        query.

        Raises:
            InvalidQueryError: if the user vector is shorter than the
                matryoshka dimension.
        """
        # Make sure the vectorset exists in the KB
        assert self.item.query.semantic is not None
        vectorset = self.item.query.semantic.vectorset
        await self.fetcher.validate_vectorset(self.kbid, vectorset)

        user_vector = self.item.query.semantic.query
        # Default to the vector exactly as provided, so the result is always
        # bound even when no matryoshka dimension is configured.
        query_vector = user_vector

        # Calculate the matryoshka dimension if applicable
        matryoshka_dimension = await self.fetcher.get_matryoshka_dimension_cached(self.kbid, vectorset)
        if matryoshka_dimension is not None:
            if len(user_vector) < matryoshka_dimension:
                raise InvalidQueryError(
                    "vector",
                    f"Invalid vector length, please check valid embedding size for {vectorset} model",
                )

            # KB using a matryoshka embeddings model, cut the query vector
            # accordingly
            query_vector = user_vector[:matryoshka_dimension]

        return vectorset, query_vector

    async def _parse_filters(self) -> Filters:
        """Build ``Filters`` from the request; defaults are returned when no filters are given."""
        filters = Filters()
        request_filters = self.item.filters
        if request_filters is None:
            return filters

        expression = request_filters.filter_expression
        if expression is not None:
            if expression.field is not None:
                filters.field_expression = await parse_expression(
                    expression.field,
                    self.kbid,
                )
            if expression.paragraph is not None:
                filters.paragraph_expression = await parse_expression(
                    expression.paragraph,
                    self.kbid,
                )
            # Anything other than an explicit OR combines the expressions with AND.
            if expression.operator == FilterExpression.Operator.OR:
                filter_operator = nodereader_pb2.FilterOperator.OR
            else:
                filter_operator = nodereader_pb2.FilterOperator.AND
            filters.filter_expression_operator = filter_operator

        filters.hidden = await filter_hidden_resources(self.kbid, request_filters.show_hidden)
        filters.security = request_filters.security
        filters.with_duplicates = request_filters.with_duplicates

        return filters

    def _parse_rank_fusion(self) -> RankFusion:
        """Translate the request's rank fusion option into the internal model.

        The window is never larger than ``MAX_RANK_FUSION_WINDOW``. For an
        explicit reciprocal rank fusion configuration it is also never smaller
        than ``top_k`` (up to that cap).

        Raises:
            InternalParserError: if the rank fusion option is not recognised.
        """
        top_k = self.item.top_k
        requested = self.item.rank_fusion

        if isinstance(requested, search_models.RankFusionName):
            if requested != search_models.RankFusionName.RECIPROCAL_RANK_FUSION:
                raise InternalParserError(f"Unknown rank fusion algorithm: {requested}")
            return ReciprocalRankFusion(window=min(top_k, MAX_RANK_FUSION_WINDOW))

        if isinstance(requested, search_models.ReciprocalRankFusion):
            # Use the shared cap rather than a literal so both branches agree.
            window = min(max(requested.window or 0, top_k), MAX_RANK_FUSION_WINDOW)
            return ReciprocalRankFusion(
                k=requested.k,
                boosting=requested.boosting,
                window=window,
            )

        raise InternalParserError(f"Unknown rank fusion {requested}")