nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,130 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from time import time
22
+
23
+ from fastapi import Header, HTTPException, Request
24
+ from fastapi_versioning import version
25
+
26
+ from nucliadb.common.exceptions import InvalidQueryError
27
+ from nucliadb.common.external_index_providers.base import TextBlockMatch
28
+ from nucliadb.common.models_utils import to_proto
29
+ from nucliadb.search.api.v1.router import KB_PREFIX, api
30
+ from nucliadb.search.search.query_parser.parsers.retrieve import parse_retrieve
31
+ from nucliadb.search.search.retrieval import text_block_search
32
+ from nucliadb_models.resource import NucliaDBRoles
33
+ from nucliadb_models.retrieval import (
34
+ Metadata,
35
+ RetrievalMatch,
36
+ RetrievalRequest,
37
+ RetrievalResponse,
38
+ Scores,
39
+ )
40
+ from nucliadb_models.search import NucliaDBClientType
41
+ from nucliadb_utils.authentication import requires
42
+ from nucliadb_utils.utilities import get_audit
43
+
44
+
45
@api.post(
    f"/{KB_PREFIX}/{{kbid}}/retrieve",
    status_code=200,
    description="Search text blocks on a Knowledge Box",
    include_in_schema=False,
    tags=["Search"],
)
@requires(NucliaDBRoles.READER)
@version(1)
async def _retrieve_endpoint(
    request: Request,
    kbid: str,
    item: RetrievalRequest,
    x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
    x_nucliadb_user: str = Header(""),
    x_forwarded_for: str = Header(""),
) -> RetrievalResponse:
    """HTTP entry point for the retrieve operation.

    Thin wrapper that forwards the path parameter, the request body and the
    client headers to `retrieve_endpoint`. `request` is part of the route
    signature but is not read in this body.
    """
    response = await retrieve_endpoint(
        kbid,
        item,
        x_ndb_client=x_ndb_client,
        x_nucliadb_user=x_nucliadb_user,
        x_forwarded_for=x_forwarded_for,
    )
    return response
69
+
70
+
71
async def retrieve_endpoint(
    kbid: str,
    item: RetrievalRequest,
    *,
    x_ndb_client: NucliaDBClientType,
    x_nucliadb_user: str,
    x_forwarded_for: str,
) -> RetrievalResponse:
    """Run a text block retrieval on a Knowledge Box and build the response.

    Args:
        kbid: Knowledge Box identifier.
        item: Retrieval request to parse and execute.
        x_ndb_client: Client type, converted to its proto form for auditing.
        x_nucliadb_user: User identifier passed to the audit call.
        x_forwarded_for: Origin passed to the audit call.

    Returns:
        A `RetrievalResponse` holding at most `retrieval.top_k` matches.

    Raises:
        HTTPException: with status 422 when parsing the request raises
            `InvalidQueryError`; the original error is kept as its cause.
    """
    audit = get_audit()
    start_time = time()

    try:
        retrieval = await parse_retrieve(kbid, item)
    except InvalidQueryError as err:
        # Chain explicitly so the parsing error is preserved as the cause.
        raise HTTPException(
            status_code=422,
            detail=str(err),
        ) from err

    # Only the text blocks are used here; the other returned values are discarded.
    text_blocks, _, _, _ = await text_block_search(kbid, retrieval)

    # cut the top K, we may have more due to extra results used for rank fusion
    text_blocks = text_blocks[: retrieval.top_k]

    # convert to response models
    matches = [text_block_match_to_retrieval_match(text_block) for text_block in text_blocks]

    if audit is not None:
        retrieval_time = time() - start_time
        audit.retrieve(
            kbid,
            x_nucliadb_user,
            to_proto.client_type(x_ndb_client),
            x_forwarded_for,
            retrieval_time,
            # TODO(decoupled-ask): add interesting things to audit
        )

    return RetrievalResponse(matches=matches)
110
+
111
+
112
def text_block_match_to_retrieval_match(item: TextBlockMatch) -> RetrievalMatch:
    """Convert an internal `TextBlockMatch` into a `RetrievalMatch` response model."""
    match_id = item.paragraph_id.full()

    # The current score is exposed field by field; the score list goes in `history`.
    current = item.current_score
    score = Scores(
        value=current.score,
        source=current.source,
        type=current.type,
        history=item.scores,
    )

    metadata = Metadata(
        field_labels=item.field_labels,
        paragraph_labels=item.paragraph_labels,
        is_an_image=item.is_an_image,
        is_a_table=item.is_a_table,
        source_file=item.representation_file,
        page=item.position.page_number,
        in_page_with_visual=item.page_with_visual,
    )

    return RetrievalMatch(id=match_id, score=score, metadata=metadata)
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import json
21
21
  from time import time
22
- from typing import Optional, Union
23
22
 
24
23
  from fastapi import Body, Header, Query, Request, Response
25
24
  from fastapi.openapi.models import Example
@@ -37,7 +36,10 @@ from nucliadb.search.requesters.utils import Method, nidx_query
37
36
  from nucliadb.search.search import cache
38
37
  from nucliadb.search.search.merge import merge_results
39
38
  from nucliadb.search.search.query_parser.parsers.search import parse_search
40
- from nucliadb.search.search.query_parser.parsers.unit_retrieval import legacy_convert_retrieval_to_proto
39
+ from nucliadb.search.search.query_parser.parsers.unit_retrieval import (
40
+ convert_retrieval_to_proto,
41
+ is_incomplete,
42
+ )
41
43
  from nucliadb.search.search.utils import (
42
44
  min_score_from_query_params,
43
45
  )
@@ -65,7 +67,7 @@ from nucliadb_utils.utilities import get_audit
65
67
  SEARCH_EXAMPLES = {
66
68
  "filtering_by_icon": Example(
67
69
  summary="Search for pdf documents where the text 'Noam Chomsky' appears",
68
- description="For a complete list of filters, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets", # noqa
70
+ description="For a complete list of filters, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
69
71
  value={
70
72
  "query": "Noam Chomsky",
71
73
  "filters": ["/icon/application/pdf"],
@@ -74,7 +76,7 @@ SEARCH_EXAMPLES = {
74
76
  ),
75
77
  "get_language_counts": Example(
76
78
  summary="Get the number of documents for each language",
77
- description="For a complete list of facets, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets", # noqa
79
+ description="For a complete list of facets, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
78
80
  value={
79
81
  "page_size": 0,
80
82
  "faceted": ["/s/p"],
@@ -88,7 +90,7 @@ SEARCH_EXAMPLES = {
88
90
  f"/{KB_PREFIX}/{{kbid}}/search",
89
91
  status_code=200,
90
92
  summary="Search Knowledge Box",
91
- description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`", # noqa: E501
93
+ description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
92
94
  response_model=KnowledgeboxSearchResults,
93
95
  response_model_exclude_unset=True,
94
96
  tags=["Search"],
@@ -100,37 +102,35 @@ async def search_knowledgebox(
100
102
  response: Response,
101
103
  kbid: str,
102
104
  query: str = fastapi_query(SearchParamDefaults.query),
103
- filter_expression: Optional[str] = fastapi_query(SearchParamDefaults.filter_expression),
105
+ filter_expression: str | None = fastapi_query(SearchParamDefaults.filter_expression),
104
106
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
105
107
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
106
108
  faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
107
109
  sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
108
- sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
109
110
  sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
110
111
  top_k: int = fastapi_query(SearchParamDefaults.top_k),
111
- min_score: Optional[float] = Query(
112
+ offset: int = fastapi_query(SearchParamDefaults.offset),
113
+ min_score: float | None = Query(
112
114
  default=None,
113
- description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
115
+ description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
114
116
  deprecated=True,
115
117
  ),
116
- min_score_semantic: Optional[float] = Query(
118
+ min_score_semantic: float | None = Query(
117
119
  default=None,
118
- description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
120
+ description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
119
121
  ),
120
122
  min_score_bm25: float = Query(
121
123
  default=0,
122
124
  description="Minimum bm25 score to filter paragraph and document index results",
123
125
  ge=0,
124
126
  ),
125
- vectorset: Optional[str] = fastapi_query(SearchParamDefaults.vectorset),
126
- range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
127
- range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
128
- range_modification_start: Optional[DateTime] = fastapi_query(
127
+ vectorset: str | None = fastapi_query(SearchParamDefaults.vectorset),
128
+ range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
129
+ range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
130
+ range_modification_start: DateTime | None = fastapi_query(
129
131
  SearchParamDefaults.range_modification_start
130
132
  ),
131
- range_modification_end: Optional[DateTime] = fastapi_query(
132
- SearchParamDefaults.range_modification_end
133
- ),
133
+ range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
134
134
  features: list[SearchOptions] = fastapi_query(
135
135
  SearchParamDefaults.search_features,
136
136
  default=[
@@ -148,13 +148,12 @@ async def search_knowledgebox(
148
148
  extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
149
149
  with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
150
150
  with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
151
- autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
152
151
  security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
153
152
  show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
154
153
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
155
154
  x_nucliadb_user: str = Header(""),
156
155
  x_forwarded_for: str = Header(""),
157
- ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
156
+ ) -> KnowledgeboxSearchResults | HTTPClientError:
158
157
  try:
159
158
  expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
160
159
 
@@ -167,11 +166,7 @@ async def search_knowledgebox(
167
166
  fields=fields,
168
167
  filters=filters,
169
168
  faceted=faceted,
170
- sort=(
171
- SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
172
- if sort_field is not None
173
- else None
174
- ),
169
+ sort=(SortOptions(field=sort_field, order=sort_order) if sort_field is not None else None),
175
170
  top_k=top_k,
176
171
  min_score=min_score_from_query_params(min_score_bm25, min_score_semantic, min_score),
177
172
  vectorset=vectorset,
@@ -187,9 +182,9 @@ async def search_knowledgebox(
187
182
  extracted=extracted,
188
183
  with_duplicates=with_duplicates,
189
184
  with_synonyms=with_synonyms,
190
- autofilter=autofilter,
191
185
  security=security,
192
186
  show_hidden=show_hidden,
187
+ offset=offset,
193
188
  )
194
189
  except ValidationError as exc:
195
190
  detail = json.loads(exc.json())
@@ -201,7 +196,7 @@ async def search_knowledgebox(
201
196
  f"/{KB_PREFIX}/{{kbid}}/search",
202
197
  status_code=200,
203
198
  summary="Search Knowledge Box",
204
- description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`", # noqa: E501
199
+ description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
205
200
  response_model=KnowledgeboxSearchResults,
206
201
  response_model_exclude_unset=True,
207
202
  tags=["Search"],
@@ -216,7 +211,7 @@ async def search_post_knowledgebox(
216
211
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
217
212
  x_nucliadb_user: str = Header(""),
218
213
  x_forwarded_for: str = Header(""),
219
- ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
214
+ ) -> KnowledgeboxSearchResults | HTTPClientError:
220
215
  return await _search_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
221
216
 
222
217
 
@@ -228,7 +223,7 @@ async def _search_endpoint(
228
223
  x_nucliadb_user: str,
229
224
  x_forwarded_for: str,
230
225
  **kwargs,
231
- ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
226
+ ) -> KnowledgeboxSearchResults | HTTPClientError:
232
227
  try:
233
228
  with cache.request_caches():
234
229
  results, incomplete = await search(
@@ -256,13 +251,14 @@ async def search(
256
251
  x_nucliadb_user: str,
257
252
  x_forwarded_for: str,
258
253
  do_audit: bool = True,
259
- with_status: Optional[ResourceProcessingStatus] = None,
254
+ with_status: ResourceProcessingStatus | None = None,
260
255
  ) -> tuple[KnowledgeboxSearchResults, bool]:
261
256
  audit = get_audit()
262
257
  start_time = time()
263
258
 
264
259
  parsed = await parse_search(kbid, item)
265
- pb_query, incomplete_results, autofilters, _ = await legacy_convert_retrieval_to_proto(parsed)
260
+ incomplete_results = is_incomplete(parsed.retrieval)
261
+ pb_query = convert_retrieval_to_proto(parsed.retrieval)
266
262
 
267
263
  # We need to query all nodes
268
264
  results, queried_shards = await nidx_query(kbid, Method.SEARCH, pb_query)
@@ -276,6 +272,7 @@ async def search(
276
272
  field_type_filter=item.field_type_filter,
277
273
  extracted=item.extracted,
278
274
  highlight=item.highlight,
275
+ offset=item.offset,
279
276
  )
280
277
 
281
278
  if audit is not None and do_audit:
@@ -290,5 +287,4 @@ async def search(
290
287
  )
291
288
 
292
289
  search_results.shards = queried_shards
293
- search_results.autofilters = autofilters
294
290
  return search_results, incomplete_results
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import json
21
21
  from datetime import datetime
22
- from typing import Optional, Union
23
22
 
24
23
  from fastapi import Header, Request, Response
25
24
  from fastapi_versioning import version
@@ -64,20 +63,18 @@ async def suggest_knowledgebox(
64
63
  response: Response,
65
64
  kbid: str,
66
65
  query: str = fastapi_query(SearchParamDefaults.suggest_query),
67
- filter_expression: Optional[str] = fastapi_query(
66
+ filter_expression: str | None = fastapi_query(
68
67
  SearchParamDefaults.filter_expression, include_in_schema=False
69
68
  ),
70
69
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
71
70
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
72
71
  faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
73
- range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
74
- range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
75
- range_modification_start: Optional[DateTime] = fastapi_query(
72
+ range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
73
+ range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
74
+ range_modification_start: DateTime | None = fastapi_query(
76
75
  SearchParamDefaults.range_modification_start
77
76
  ),
78
- range_modification_end: Optional[DateTime] = fastapi_query(
79
- SearchParamDefaults.range_modification_end
80
- ),
77
+ range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
81
78
  features: list[SuggestOptions] = fastapi_query(SearchParamDefaults.suggest_features),
82
79
  show: list[ResourceProperties] = fastapi_query(SearchParamDefaults.show),
83
80
  field_type_filter: list[FieldTypeName] = fastapi_query(
@@ -89,7 +86,7 @@ async def suggest_knowledgebox(
89
86
  debug: bool = fastapi_query(SearchParamDefaults.debug),
90
87
  highlight: bool = fastapi_query(SearchParamDefaults.highlight),
91
88
  show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
92
- ) -> Union[KnowledgeboxSuggestResults, HTTPClientError]:
89
+ ) -> KnowledgeboxSuggestResults | HTTPClientError:
93
90
  try:
94
91
  expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
95
92
 
@@ -126,14 +123,14 @@ async def suggest(
126
123
  response,
127
124
  kbid: str,
128
125
  query: str,
129
- filter_expression: Optional[FilterExpression],
126
+ filter_expression: FilterExpression | None,
130
127
  fields: list[str],
131
128
  filters: list[str],
132
129
  faceted: list[str],
133
- range_creation_start: Optional[datetime],
134
- range_creation_end: Optional[datetime],
135
- range_modification_start: Optional[datetime],
136
- range_modification_end: Optional[datetime],
130
+ range_creation_start: datetime | None,
131
+ range_creation_end: datetime | None,
132
+ range_modification_start: datetime | None,
133
+ range_modification_end: datetime | None,
137
134
  features: list[SuggestOptions],
138
135
  show: list[ResourceProperties],
139
136
  field_type_filter: list[FieldTypeName],
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Union
21
20
 
22
21
  from fastapi import Header, Request
23
22
  from fastapi_versioning import version
@@ -48,7 +47,7 @@ async def summarize_endpoint(
48
47
  kbid: str,
49
48
  item: SummarizeRequest,
50
49
  x_show_consumption: bool = Header(default=False),
51
- ) -> Union[SummarizedResponse, HTTPClientError]:
50
+ ) -> SummarizedResponse | HTTPClientError:
52
51
  try:
53
52
  return await summarize(
54
53
  kbid=kbid,
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Any, Optional
20
+ from typing import Any
21
21
 
22
22
  from fastapi import Query
23
23
 
@@ -26,7 +26,7 @@ from nucliadb_models.search import ParamDefault
26
26
  _NOT_SET = object()
27
27
 
28
28
 
29
- def fastapi_query(param: ParamDefault, default: Optional[Any] = _NOT_SET, **kw) -> Query: # type: ignore
29
+ def fastapi_query(param: ParamDefault, default: Any | None = _NOT_SET, **kw) -> Query: # type: ignore
30
30
  # Be able to override default value
31
31
  if default is _NOT_SET:
32
32
  default_value = param.default
nucliadb/search/app.py CHANGED
@@ -26,7 +26,7 @@ from starlette.middleware.authentication import AuthenticationMiddleware
26
26
  from starlette.requests import ClientDisconnect, Request
27
27
  from starlette.responses import HTMLResponse
28
28
 
29
- from nucliadb.middleware import ProcessTimeHeaderMiddleware
29
+ from nucliadb.middleware import ClientErrorPayloadLoggerMiddleware, ProcessTimeHeaderMiddleware
30
30
  from nucliadb.search import API_PREFIX
31
31
  from nucliadb.search.api.v1.router import api as api_v1
32
32
  from nucliadb.search.lifecycle import lifespan
@@ -47,6 +47,7 @@ middleware.extend(
47
47
  [
48
48
  Middleware(AuthenticationMiddleware, backend=NucliaCloudAuthenticationBackend()),
49
49
  Middleware(AuditMiddleware, audit_utility_getter=get_audit),
50
+ Middleware(ClientErrorPayloadLoggerMiddleware),
50
51
  ]
51
52
  )
52
53
 
@@ -58,7 +59,6 @@ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
58
59
 
59
60
  fastapi_settings = dict(
60
61
  debug=running_settings.debug,
61
- middleware=middleware,
62
62
  lifespan=lifespan,
63
63
  exception_handlers={
64
64
  Exception: global_exception_handler,
@@ -78,6 +78,7 @@ application = VersionedFastAPI(
78
78
  prefix_format=f"/{API_PREFIX}/v{{major}}",
79
79
  default_version=(1, 0),
80
80
  enable_latest=False,
81
+ middleware=middleware,
81
82
  kwargs=fastapi_settings,
82
83
  )
83
84
 
@@ -0,0 +1,21 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from . import fields, paragraphs, resources # noqa: F401
21
+ from .augmentor import augment # noqa: F401