nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import json
21
21
  from time import time
22
- from typing import Optional, Union
23
22
 
24
23
  from fastapi import Body, Header, Query, Request, Response
25
24
  from fastapi.openapi.models import Example
@@ -37,7 +36,10 @@ from nucliadb.search.requesters.utils import Method, nidx_query
37
36
  from nucliadb.search.search import cache
38
37
  from nucliadb.search.search.merge import merge_results
39
38
  from nucliadb.search.search.query_parser.parsers.search import parse_search
40
- from nucliadb.search.search.query_parser.parsers.unit_retrieval import legacy_convert_retrieval_to_proto
39
+ from nucliadb.search.search.query_parser.parsers.unit_retrieval import (
40
+ convert_retrieval_to_proto,
41
+ is_incomplete,
42
+ )
41
43
  from nucliadb.search.search.utils import (
42
44
  min_score_from_query_params,
43
45
  )
@@ -65,7 +67,7 @@ from nucliadb_utils.utilities import get_audit
65
67
  SEARCH_EXAMPLES = {
66
68
  "filtering_by_icon": Example(
67
69
  summary="Search for pdf documents where the text 'Noam Chomsky' appears",
68
- description="For a complete list of filters, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets", # noqa
70
+ description="For a complete list of filters, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
69
71
  value={
70
72
  "query": "Noam Chomsky",
71
73
  "filters": ["/icon/application/pdf"],
@@ -74,7 +76,7 @@ SEARCH_EXAMPLES = {
74
76
  ),
75
77
  "get_language_counts": Example(
76
78
  summary="Get the number of documents for each language",
77
- description="For a complete list of facets, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets", # noqa
79
+ description="For a complete list of facets, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
78
80
  value={
79
81
  "page_size": 0,
80
82
  "faceted": ["/s/p"],
@@ -88,7 +90,7 @@ SEARCH_EXAMPLES = {
88
90
  f"/{KB_PREFIX}/{{kbid}}/search",
89
91
  status_code=200,
90
92
  summary="Search Knowledge Box",
91
- description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`", # noqa: E501
93
+ description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
92
94
  response_model=KnowledgeboxSearchResults,
93
95
  response_model_exclude_unset=True,
94
96
  tags=["Search"],
@@ -100,37 +102,35 @@ async def search_knowledgebox(
100
102
  response: Response,
101
103
  kbid: str,
102
104
  query: str = fastapi_query(SearchParamDefaults.query),
103
- filter_expression: Optional[str] = fastapi_query(SearchParamDefaults.filter_expression),
105
+ filter_expression: str | None = fastapi_query(SearchParamDefaults.filter_expression),
104
106
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
105
107
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
106
108
  faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
107
109
  sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
108
- sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
109
110
  sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
110
111
  top_k: int = fastapi_query(SearchParamDefaults.top_k),
111
- min_score: Optional[float] = Query(
112
+ offset: int = fastapi_query(SearchParamDefaults.offset),
113
+ min_score: float | None = Query(
112
114
  default=None,
113
- description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
115
+ description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
114
116
  deprecated=True,
115
117
  ),
116
- min_score_semantic: Optional[float] = Query(
118
+ min_score_semantic: float | None = Query(
117
119
  default=None,
118
- description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
120
+ description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
119
121
  ),
120
122
  min_score_bm25: float = Query(
121
123
  default=0,
122
124
  description="Minimum bm25 score to filter paragraph and document index results",
123
125
  ge=0,
124
126
  ),
125
- vectorset: Optional[str] = fastapi_query(SearchParamDefaults.vectorset),
126
- range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
127
- range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
128
- range_modification_start: Optional[DateTime] = fastapi_query(
127
+ vectorset: str | None = fastapi_query(SearchParamDefaults.vectorset),
128
+ range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
129
+ range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
130
+ range_modification_start: DateTime | None = fastapi_query(
129
131
  SearchParamDefaults.range_modification_start
130
132
  ),
131
- range_modification_end: Optional[DateTime] = fastapi_query(
132
- SearchParamDefaults.range_modification_end
133
- ),
133
+ range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
134
134
  features: list[SearchOptions] = fastapi_query(
135
135
  SearchParamDefaults.search_features,
136
136
  default=[
@@ -148,13 +148,12 @@ async def search_knowledgebox(
148
148
  extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
149
149
  with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
150
150
  with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
151
- autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
152
151
  security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
153
152
  show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
154
153
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
155
154
  x_nucliadb_user: str = Header(""),
156
155
  x_forwarded_for: str = Header(""),
157
- ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
156
+ ) -> KnowledgeboxSearchResults | HTTPClientError:
158
157
  try:
159
158
  expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
160
159
 
@@ -167,11 +166,7 @@ async def search_knowledgebox(
167
166
  fields=fields,
168
167
  filters=filters,
169
168
  faceted=faceted,
170
- sort=(
171
- SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
172
- if sort_field is not None
173
- else None
174
- ),
169
+ sort=(SortOptions(field=sort_field, order=sort_order) if sort_field is not None else None),
175
170
  top_k=top_k,
176
171
  min_score=min_score_from_query_params(min_score_bm25, min_score_semantic, min_score),
177
172
  vectorset=vectorset,
@@ -187,9 +182,9 @@ async def search_knowledgebox(
187
182
  extracted=extracted,
188
183
  with_duplicates=with_duplicates,
189
184
  with_synonyms=with_synonyms,
190
- autofilter=autofilter,
191
185
  security=security,
192
186
  show_hidden=show_hidden,
187
+ offset=offset,
193
188
  )
194
189
  except ValidationError as exc:
195
190
  detail = json.loads(exc.json())
@@ -201,7 +196,7 @@ async def search_knowledgebox(
201
196
  f"/{KB_PREFIX}/{{kbid}}/search",
202
197
  status_code=200,
203
198
  summary="Search Knowledge Box",
204
- description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`", # noqa: E501
199
+ description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
205
200
  response_model=KnowledgeboxSearchResults,
206
201
  response_model_exclude_unset=True,
207
202
  tags=["Search"],
@@ -216,7 +211,7 @@ async def search_post_knowledgebox(
216
211
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
217
212
  x_nucliadb_user: str = Header(""),
218
213
  x_forwarded_for: str = Header(""),
219
- ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
214
+ ) -> KnowledgeboxSearchResults | HTTPClientError:
220
215
  return await _search_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
221
216
 
222
217
 
@@ -228,7 +223,7 @@ async def _search_endpoint(
228
223
  x_nucliadb_user: str,
229
224
  x_forwarded_for: str,
230
225
  **kwargs,
231
- ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
226
+ ) -> KnowledgeboxSearchResults | HTTPClientError:
232
227
  try:
233
228
  with cache.request_caches():
234
229
  results, incomplete = await search(
@@ -256,13 +251,14 @@ async def search(
256
251
  x_nucliadb_user: str,
257
252
  x_forwarded_for: str,
258
253
  do_audit: bool = True,
259
- with_status: Optional[ResourceProcessingStatus] = None,
254
+ with_status: ResourceProcessingStatus | None = None,
260
255
  ) -> tuple[KnowledgeboxSearchResults, bool]:
261
256
  audit = get_audit()
262
257
  start_time = time()
263
258
 
264
259
  parsed = await parse_search(kbid, item)
265
- pb_query, incomplete_results, autofilters, _ = await legacy_convert_retrieval_to_proto(parsed)
260
+ incomplete_results = is_incomplete(parsed.retrieval)
261
+ pb_query = convert_retrieval_to_proto(parsed.retrieval)
266
262
 
267
263
  # We need to query all nodes
268
264
  results, queried_shards = await nidx_query(kbid, Method.SEARCH, pb_query)
@@ -276,6 +272,7 @@ async def search(
276
272
  field_type_filter=item.field_type_filter,
277
273
  extracted=item.extracted,
278
274
  highlight=item.highlight,
275
+ offset=item.offset,
279
276
  )
280
277
 
281
278
  if audit is not None and do_audit:
@@ -290,5 +287,4 @@ async def search(
290
287
  )
291
288
 
292
289
  search_results.shards = queried_shards
293
- search_results.autofilters = autofilters
294
290
  return search_results, incomplete_results
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import json
21
21
  from datetime import datetime
22
- from typing import Optional, Union
23
22
 
24
23
  from fastapi import Header, Request, Response
25
24
  from fastapi_versioning import version
@@ -64,20 +63,18 @@ async def suggest_knowledgebox(
64
63
  response: Response,
65
64
  kbid: str,
66
65
  query: str = fastapi_query(SearchParamDefaults.suggest_query),
67
- filter_expression: Optional[str] = fastapi_query(
66
+ filter_expression: str | None = fastapi_query(
68
67
  SearchParamDefaults.filter_expression, include_in_schema=False
69
68
  ),
70
69
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
71
70
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
72
71
  faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
73
- range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
74
- range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
75
- range_modification_start: Optional[DateTime] = fastapi_query(
72
+ range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
73
+ range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
74
+ range_modification_start: DateTime | None = fastapi_query(
76
75
  SearchParamDefaults.range_modification_start
77
76
  ),
78
- range_modification_end: Optional[DateTime] = fastapi_query(
79
- SearchParamDefaults.range_modification_end
80
- ),
77
+ range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
81
78
  features: list[SuggestOptions] = fastapi_query(SearchParamDefaults.suggest_features),
82
79
  show: list[ResourceProperties] = fastapi_query(SearchParamDefaults.show),
83
80
  field_type_filter: list[FieldTypeName] = fastapi_query(
@@ -89,7 +86,7 @@ async def suggest_knowledgebox(
89
86
  debug: bool = fastapi_query(SearchParamDefaults.debug),
90
87
  highlight: bool = fastapi_query(SearchParamDefaults.highlight),
91
88
  show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
92
- ) -> Union[KnowledgeboxSuggestResults, HTTPClientError]:
89
+ ) -> KnowledgeboxSuggestResults | HTTPClientError:
93
90
  try:
94
91
  expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
95
92
 
@@ -126,14 +123,14 @@ async def suggest(
126
123
  response,
127
124
  kbid: str,
128
125
  query: str,
129
- filter_expression: Optional[FilterExpression],
126
+ filter_expression: FilterExpression | None,
130
127
  fields: list[str],
131
128
  filters: list[str],
132
129
  faceted: list[str],
133
- range_creation_start: Optional[datetime],
134
- range_creation_end: Optional[datetime],
135
- range_modification_start: Optional[datetime],
136
- range_modification_end: Optional[datetime],
130
+ range_creation_start: datetime | None,
131
+ range_creation_end: datetime | None,
132
+ range_modification_start: datetime | None,
133
+ range_modification_end: datetime | None,
137
134
  features: list[SuggestOptions],
138
135
  show: list[ResourceProperties],
139
136
  field_type_filter: list[FieldTypeName],
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Union
21
20
 
22
21
  from fastapi import Header, Request
23
22
  from fastapi_versioning import version
@@ -48,7 +47,7 @@ async def summarize_endpoint(
48
47
  kbid: str,
49
48
  item: SummarizeRequest,
50
49
  x_show_consumption: bool = Header(default=False),
51
- ) -> Union[SummarizedResponse, HTTPClientError]:
50
+ ) -> SummarizedResponse | HTTPClientError:
52
51
  try:
53
52
  return await summarize(
54
53
  kbid=kbid,
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Any, Optional
20
+ from typing import Any
21
21
 
22
22
  from fastapi import Query
23
23
 
@@ -26,7 +26,7 @@ from nucliadb_models.search import ParamDefault
26
26
  _NOT_SET = object()
27
27
 
28
28
 
29
- def fastapi_query(param: ParamDefault, default: Optional[Any] = _NOT_SET, **kw) -> Query: # type: ignore
29
+ def fastapi_query(param: ParamDefault, default: Any | None = _NOT_SET, **kw) -> Query: # type: ignore
30
30
  # Be able to override default value
31
31
  if default is _NOT_SET:
32
32
  default_value = param.default
nucliadb/search/app.py CHANGED
@@ -26,7 +26,7 @@ from starlette.middleware.authentication import AuthenticationMiddleware
26
26
  from starlette.requests import ClientDisconnect, Request
27
27
  from starlette.responses import HTMLResponse
28
28
 
29
- from nucliadb.middleware import ProcessTimeHeaderMiddleware
29
+ from nucliadb.middleware import ClientErrorPayloadLoggerMiddleware, ProcessTimeHeaderMiddleware
30
30
  from nucliadb.search import API_PREFIX
31
31
  from nucliadb.search.api.v1.router import api as api_v1
32
32
  from nucliadb.search.lifecycle import lifespan
@@ -47,6 +47,7 @@ middleware.extend(
47
47
  [
48
48
  Middleware(AuthenticationMiddleware, backend=NucliaCloudAuthenticationBackend()),
49
49
  Middleware(AuditMiddleware, audit_utility_getter=get_audit),
50
+ Middleware(ClientErrorPayloadLoggerMiddleware),
50
51
  ]
51
52
  )
52
53
 
@@ -58,7 +59,6 @@ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
58
59
 
59
60
  fastapi_settings = dict(
60
61
  debug=running_settings.debug,
61
- middleware=middleware,
62
62
  lifespan=lifespan,
63
63
  exception_handlers={
64
64
  Exception: global_exception_handler,
@@ -78,6 +78,7 @@ application = VersionedFastAPI(
78
78
  prefix_format=f"/{API_PREFIX}/v{{major}}",
79
79
  default_version=(1, 0),
80
80
  enable_latest=False,
81
+ middleware=middleware,
81
82
  kwargs=fastapi_settings,
82
83
  )
83
84
 
@@ -0,0 +1,21 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from . import fields, paragraphs, resources # noqa: F401
21
+ from .augmentor import augment # noqa: F401
@@ -0,0 +1,232 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ from typing import Any
22
+
23
+ from typing_extensions import assert_never
24
+
25
+ import nucliadb_models
26
+ from nucliadb.common import datamanagers
27
+ from nucliadb.common.ids import FIELD_TYPE_NAME_TO_STR, FieldId, ParagraphId
28
+ from nucliadb.models.internal.augment import (
29
+ Augment,
30
+ Augmented,
31
+ AugmentedField,
32
+ AugmentedParagraph,
33
+ AugmentedResource,
34
+ )
35
+ from nucliadb.search.augmentor.utils import limited_concurrency
36
+ from nucliadb.search.search.hydrator import ResourceHydrationOptions
37
+ from nucliadb_models.common import FieldTypeName
38
+ from nucliadb_models.resource import Resource
39
+
40
+ from .fields import augment_field
41
+ from .paragraphs import augment_paragraph
42
+ from .resources import augment_resource, augment_resource_deep
43
+
44
+
45
async def augment(
    kbid: str,
    augmentations: list[Augment],
    *,
    concurrency_control: asyncio.Semaphore | None = None,
) -> Augmented:
    """Process multiple augmentations concurrently and return the augmented content.

    This is a heavy operation that can lead to many I/O operations with maindb
    and/or blob storage. For improved performance, make sure this is called
    inside the context of `nucliadb.search.search.cache` `request_caches`

    The work happens in three phases:

    1. Every augmentation is grouped by its target (resource id, field id or
       paragraph id) so that each target is augmented by a single operation,
       no matter how many augmentations refer to it.
    2. One task per target is scheduled, each wrapped in `limited_concurrency`.
    3. All tasks are awaited together and the results are indexed by id.

    Args:
        kbid: Knowledge box the augmented content belongs to.
        augmentations: Augmentations to apply. Each one declares where it
            applies (`from_`), the ids it applies to (`given`) and what to
            retrieve.
        concurrency_control: Optional semaphore forwarded as `max_ops` to
            `limited_concurrency` for every scheduled operation (presumably
            bounding how many run at once -- confirm in `limited_concurrency`).

    Returns:
        An `Augmented` object with resources, deep resources, fields and
        paragraphs indexed by id. Operations that returned `None` are omitted.
    """
    # Phase 1: group what has to be augmented by target id.
    augments: dict[str, Any] = {
        "resources": {},
        "resources.deep": {},
        "fields": {},
        "paragraphs": {},
    }
    for augmentation in augmentations:
        if augmentation.from_ == "resources":
            for given_id in augmentation.given:
                # any kind of id can be used to refer to its resource
                if isinstance(given_id, str):
                    rid = given_id
                elif isinstance(given_id, FieldId):
                    rid = given_id.rid
                elif isinstance(given_id, ParagraphId):
                    rid = given_id.rid
                else:  # pragma: no cover
                    assert_never(given_id)

                augments["resources"].setdefault(rid, []).extend(augmentation.select)

        elif augmentation.from_ == "resources.deep":
            for rid in augmentation.given:
                # merge the options of every augmentation targeting this resource
                opts = augments["resources.deep"].setdefault(rid, ResourceHydrationOptions())
                opts.show.extend(augmentation.show)
                opts.extracted.extend(augmentation.extracted)
                opts.field_type_filter.extend(augmentation.field_type_filter)

        elif augmentation.from_ == "fields":
            unfiltered_field_ids: list[FieldId] = []
            for given_id in augmentation.given:
                if isinstance(given_id, str):
                    # augmenting resource fields
                    rid = given_id
                    all_field_ids = await datamanagers.atomic.resources.get_all_field_ids(
                        kbid=kbid, rid=rid, for_update=False
                    )
                    if all_field_ids is None:
                        continue

                    unfiltered_field_ids.extend(
                        FieldId.from_pb(
                            rid=rid, field_type=field_id_pb.field_type, key=field_id_pb.field
                        )
                        for field_id_pb in all_field_ids.fields
                    )

                elif isinstance(given_id, FieldId):
                    unfiltered_field_ids.append(given_id)

                elif isinstance(given_id, ParagraphId):
                    unfiltered_field_ids.append(given_id.field_id)

                else:  # pragma: no cover
                    assert_never(given_id)

            if not augmentation.filter:
                field_ids = unfiltered_field_ids
            else:
                field_ids = []
                for field_id in unfiltered_field_ids:
                    for field_filter in augmentation.filter:
                        if isinstance(field_filter, nucliadb_models.filters.Field):
                            matches = field_filter.type == field_id.type and (
                                field_filter.name is None or field_filter.name == field_id.key
                            )

                        elif isinstance(field_filter, nucliadb_models.filters.Generated):
                            # generated fields are always text fields starting with "da-"
                            matches = field_id.type == FIELD_TYPE_NAME_TO_STR[FieldTypeName.TEXT] and (
                                field_filter.da_task is None
                                or field_id.key.startswith(f"da-{field_filter.da_task}-")
                            )

                        else:  # pragma: no cover
                            assert_never(field_filter)

                        if matches:
                            # A field passes as soon as one filter accepts it.
                            # Stop here so a field accepted by several filters
                            # is not added (and its selection duplicated) once
                            # per matching filter.
                            field_ids.append(field_id)
                            break

            for field_id in field_ids:
                augments["fields"].setdefault(field_id, []).extend(augmentation.select)

        elif augmentation.from_ == "files" or augmentation.from_ == "conversations":
            # file and conversation augmentations are handled as field augmentations
            for given_id in augmentation.given:
                if isinstance(given_id, FieldId):
                    field_id = given_id
                elif isinstance(given_id, ParagraphId):
                    field_id = given_id.field_id
                else:  # pragma: no cover
                    assert_never(given_id)

                augments["fields"].setdefault(field_id, []).extend(augmentation.select)

        elif augmentation.from_ == "paragraphs":
            for paragraph in augmentation.given:
                select, metadata = augments["paragraphs"].setdefault(paragraph.id, ([], None))
                select.extend(augmentation.select)
                # we keep the first metadata object we see
                metadata = metadata or paragraph.metadata
                augments["paragraphs"][paragraph.id] = (select, metadata)

        else:  # pragma: no cover
            assert_never(augmentation.from_)

    # Phase 2: schedule one task per target.
    ops = {  # type: ignore[var-annotated]
        "resources": [],
        "resources.deep": [],
        "fields": [],
        "paragraphs": [],
    }
    for rid, select in augments["resources"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_resource(  # type: ignore[arg-type]
                    kbid, rid, select
                ),
                max_ops=concurrency_control,
            )
        )
        ops["resources"].append(task)

    for rid, opts in augments["resources.deep"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_resource_deep(  # type: ignore[arg-type]
                    kbid, rid, opts
                ),
                max_ops=concurrency_control,
            )
        )
        ops["resources.deep"].append(task)

    for field_id, select in augments["fields"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_field(  # type: ignore[arg-type]
                    kbid, field_id, select
                ),
                max_ops=concurrency_control,
            )
        )
        ops["fields"].append(task)

    for paragraph_id, (select, metadata) in augments["paragraphs"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_paragraph(  # type: ignore[arg-type]
                    kbid, paragraph_id, select, metadata
                ),
                max_ops=concurrency_control,
            )
        )
        ops["paragraphs"].append(task)

    # Phase 3: await everything at once. `asyncio.gather` returns results in
    # the order the awaitables were passed, so they can be split back into
    # their groups by consuming the list from the front, group by group.
    results = await asyncio.gather(
        *ops["resources"], *ops["resources.deep"], *ops["fields"], *ops["paragraphs"]
    )

    resources: list[AugmentedResource] = results[: len(ops["resources"])]
    del results[: len(ops["resources"])]
    resources_deep: list[Resource] = results[: len(ops["resources.deep"])]
    del results[: len(ops["resources.deep"])]
    fields: list[AugmentedField] = results[: len(ops["fields"])]
    del results[: len(ops["fields"])]
    paragraphs: list[AugmentedParagraph] = results[: len(ops["paragraphs"])]

    # operations may return None; those are left out of the result
    return Augmented(
        resources={resource.id: resource for resource in resources if resource is not None},
        resources_deep={
            resource_deep.id: resource_deep
            for resource_deep in resources_deep
            if resource_deep is not None
        },
        fields={field.id: field for field in fields if field is not None},
        paragraphs={paragraph.id: paragraph for paragraph in paragraphs if paragraph is not None},
    )