nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import AsyncGenerator, Optional
21
+ from collections.abc import AsyncGenerator
22
22
 
23
23
  from nucliadb.train.generators.utils import batchify
24
24
  from nucliadb_models.filters import FilterExpression
@@ -33,7 +33,7 @@ def image_classification_batch_generator(
33
33
  kbid: str,
34
34
  trainset: TrainSet,
35
35
  shard_replica_id: str,
36
- filter_expression: Optional[FilterExpression],
36
+ filter_expression: FilterExpression | None,
37
37
  ) -> AsyncGenerator[ImageClassificationBatch, None]:
38
38
  generator = generate_image_classification_payloads(kbid, trainset, shard_replica_id)
39
39
  batch_generator = batchify(generator, trainset.batch_size, ImageClassificationBatch)
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import AsyncGenerator, Optional
21
+ from collections.abc import AsyncGenerator
22
22
 
23
23
  from fastapi import HTTPException
24
24
  from nidx_protos.nodereader_pb2 import StreamRequest
@@ -38,7 +38,7 @@ def paragraph_classification_batch_generator(
38
38
  kbid: str,
39
39
  trainset: TrainSet,
40
40
  shard_replica_id: str,
41
- filter_expression: Optional[FilterExpression],
41
+ filter_expression: FilterExpression | None,
42
42
  ) -> AsyncGenerator[ParagraphClassificationBatch, None]:
43
43
  if len(trainset.filter.labels) != 1:
44
44
  raise HTTPException(
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import AsyncGenerator, Optional
21
+ from collections.abc import AsyncGenerator
22
22
 
23
23
  from nidx_protos.nodereader_pb2 import StreamRequest
24
24
 
@@ -38,7 +38,7 @@ def paragraph_streaming_batch_generator(
38
38
  kbid: str,
39
39
  trainset: TrainSet,
40
40
  shard_replica_id: str,
41
- filter_expression: Optional[FilterExpression],
41
+ filter_expression: FilterExpression | None,
42
42
  ) -> AsyncGenerator[ParagraphStreamingBatch, None]:
43
43
  generator = generate_paragraph_streaming_payloads(kbid, trainset, shard_replica_id)
44
44
  batch_generator = batchify(generator, trainset.batch_size, ParagraphStreamingBatch)
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import AsyncGenerator, Optional
21
+ from collections.abc import AsyncGenerator
22
22
 
23
23
  from nidx_protos.nodereader_pb2 import StreamRequest
24
24
 
@@ -47,7 +47,7 @@ def question_answer_batch_generator(
47
47
  kbid: str,
48
48
  trainset: TrainSet,
49
49
  shard_replica_id: str,
50
- filter_expression: Optional[FilterExpression],
50
+ filter_expression: FilterExpression | None,
51
51
  ) -> AsyncGenerator[QuestionAnswerStreamingBatch, None]:
52
52
  generator = generate_question_answer_streaming_payloads(kbid, trainset, shard_replica_id)
53
53
  batch_generator = batchify(generator, trainset.batch_size, QuestionAnswerStreamingBatch)
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import AsyncGenerator, Optional
21
+ from collections.abc import AsyncGenerator
22
22
 
23
23
  from fastapi import HTTPException
24
24
  from nidx_protos.nodereader_pb2 import StreamRequest
@@ -40,7 +40,7 @@ def sentence_classification_batch_generator(
40
40
  kbid: str,
41
41
  trainset: TrainSet,
42
42
  shard_replica_id: str,
43
- filter_expression: Optional[FilterExpression],
43
+ filter_expression: FilterExpression | None,
44
44
  ) -> AsyncGenerator[SentenceClassificationBatch, None]:
45
45
  if len(trainset.filter.labels) == 0:
46
46
  raise HTTPException(
@@ -116,10 +116,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
116
116
  if split is not None:
117
117
  text = extracted_text.split_text[split]
118
118
  for paragraph in field_metadata.split_metadata[split].paragraphs:
119
- if paragraph.key == "":
120
- key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
121
- else:
122
- key = paragraph.key
119
+ key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
123
120
  if key == result:
124
121
  for sentence in paragraph.sentences:
125
122
  splitted_text = text[sentence.start : sentence.end]
@@ -127,10 +124,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
127
124
  else:
128
125
  text = extracted_text.text
129
126
  for paragraph in field_metadata.metadata.paragraphs:
130
- if paragraph.key == "":
131
- key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
132
- else:
133
- key = paragraph.key
127
+ key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
134
128
  if key == result:
135
129
  for sentence in paragraph.sentences:
136
130
  splitted_text = text[sentence.start : sentence.end]
@@ -19,7 +19,8 @@
19
19
  #
20
20
 
21
21
  from collections import OrderedDict
22
- from typing import AsyncGenerator, Optional, cast
22
+ from collections.abc import AsyncGenerator
23
+ from typing import cast
23
24
 
24
25
  from nidx_protos.nodereader_pb2 import StreamFilter, StreamRequest
25
26
 
@@ -43,7 +44,7 @@ def token_classification_batch_generator(
43
44
  kbid: str,
44
45
  trainset: TrainSet,
45
46
  shard_replica_id: str,
46
- filter_expression: Optional[FilterExpression],
47
+ filter_expression: FilterExpression | None,
47
48
  ) -> AsyncGenerator[TokenClassificationBatch, None]:
48
49
  generator = generate_token_classification_payloads(kbid, trainset, shard_replica_id)
49
50
  batch_generator = batchify(generator, trainset.batch_size, TokenClassificationBatch)
@@ -18,7 +18,8 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Any, AsyncGenerator, AsyncIterator, Optional, Type
21
+ from collections.abc import AsyncGenerator, AsyncIterator
22
+ from typing import Any
22
23
 
23
24
  from nucliadb.common.cache import get_resource_cache
24
25
  from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB
@@ -30,16 +31,16 @@ from nucliadb.train.types import T
30
31
  from nucliadb_utils.utilities import get_storage
31
32
 
32
33
 
33
- async def get_resource_from_cache_or_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
34
+ async def get_resource_from_cache_or_db(kbid: str, uuid: str) -> ResourceORM | None:
34
35
  resource_cache = get_resource_cache()
35
36
  if resource_cache is None:
36
- return await _get_resource_from_db(kbid, uuid)
37
37
  logger.warning("Resource cache is not set")
38
+ return await _get_resource_from_db(kbid, uuid)
38
39
 
39
40
  return await resource_cache.get(kbid, uuid)
40
41
 
41
42
 
42
- async def _get_resource_from_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
43
+ async def _get_resource_from_db(kbid: str, uuid: str) -> ResourceORM | None:
43
44
  storage = await get_storage(service_name=SERVICE_NAME)
44
45
  async with get_driver().ro_transaction() as transaction:
45
46
  kb = KnowledgeBoxORM(transaction, storage, kbid)
@@ -81,7 +82,7 @@ async def get_paragraph(kbid: str, paragraph_id: str) -> str:
81
82
 
82
83
 
83
84
  async def batchify(
84
- producer: AsyncIterator[Any], size: int, batch_klass: Type[T]
85
+ producer: AsyncIterator[Any], size: int, batch_klass: type[T]
85
86
  ) -> AsyncGenerator[T, None]:
86
87
  # NOTE: we are supposing all protobuffers have a data field
87
88
  batch = []
nucliadb/train/nodes.py CHANGED
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import AsyncIterator, Optional
20
+ from collections.abc import AsyncIterator
21
21
 
22
22
  from nucliadb.common import datamanagers
23
23
  from nucliadb.common.cluster import manager
@@ -62,7 +62,7 @@ class TrainShardManager(manager.KBShardManager):
62
62
 
63
63
  return shard_object.nidx_shard_id
64
64
 
65
- async def get_kb_obj(self, txn: Transaction, kbid: str) -> Optional[KnowledgeBox]:
65
+ async def get_kb_obj(self, txn: Transaction, kbid: str) -> KnowledgeBox | None:
66
66
  if kbid is None:
67
67
  return None
68
68
 
@@ -72,7 +72,7 @@ class TrainShardManager(manager.KBShardManager):
72
72
  kbobj = KnowledgeBox(txn, self.storage, kbid)
73
73
  return kbobj
74
74
 
75
- async def get_kb_entities_manager(self, txn: Transaction, kbid: str) -> Optional[EntitiesManager]:
75
+ async def get_kb_entities_manager(self, txn: Transaction, kbid: str) -> EntitiesManager | None:
76
76
  kbobj = await self.get_kb_obj(txn, kbid)
77
77
  if kbobj is None:
78
78
  return None
@@ -19,7 +19,7 @@
19
19
  #
20
20
  from __future__ import annotations
21
21
 
22
- from typing import AsyncIterator, MutableMapping, Optional
22
+ from collections.abc import AsyncIterator, MutableMapping
23
23
 
24
24
  from nucliadb.common import datamanagers
25
25
  from nucliadb.ingest.orm.resource import Resource
@@ -69,9 +69,7 @@ async def iterate_sentences(
69
69
  # return any
70
70
  vectorset_id = None
71
71
  async with datamanagers.with_ro_transaction() as txn:
72
- async for vectorset_id, vs in datamanagers.vectorsets.iter(
73
- txn=txn, kbid=resource.kb.kbid
74
- ):
72
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(txn=txn, kbid=resource.kbid):
75
73
  break
76
74
  assert vectorset_id is not None, "All KBs must have at least a vectorset"
77
75
  vo = await field.get_vectors(vectorset_id, vs.storage_key_kind)
@@ -81,7 +79,7 @@ async def iterate_sentences(
81
79
  if fm is None:
82
80
  continue
83
81
 
84
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
82
+ field_metadatas: list[tuple[str | None, FieldMetadata]] = [(None, fm.metadata)]
85
83
  for subfield_metadata, splitted_metadata in fm.split_metadata.items():
86
84
  field_metadatas.append((subfield_metadata, splitted_metadata))
87
85
 
@@ -188,7 +186,7 @@ async def iterate_paragraphs(
188
186
  if fm is None:
189
187
  continue
190
188
 
191
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
189
+ field_metadatas: list[tuple[str | None, FieldMetadata]] = [(None, fm.metadata)]
192
190
  for subfield_metadata, splitted_metadata in fm.split_metadata.items():
193
191
  field_metadatas.append((subfield_metadata, splitted_metadata))
194
192
 
@@ -264,7 +262,7 @@ async def iterate_fields(
264
262
  if fm is None:
265
263
  continue
266
264
 
267
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
265
+ field_metadatas: list[tuple[str | None, FieldMetadata]] = [(None, fm.metadata)]
268
266
  for subfield_metadata, splitted_metadata in fm.split_metadata.items():
269
267
  field_metadatas.append((subfield_metadata, splitted_metadata))
270
268
 
@@ -319,7 +317,7 @@ async def generate_train_resource(
319
317
  if fm is None:
320
318
  continue
321
319
 
322
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
320
+ field_metadatas: list[tuple[str | None, FieldMetadata]] = [(None, fm.metadata)]
323
321
  for subfield_metadata, splitted_metadata in fm.split_metadata.items():
324
322
  field_metadatas.append((subfield_metadata, splitted_metadata))
325
323
 
@@ -17,17 +17,16 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  from nucliadb.ingest.settings import DriverSettings
23
22
 
24
23
 
25
24
  class Settings(DriverSettings):
26
25
  grpc_port: int = 8031
27
- train_grpc_address: Optional[str] = None
26
+ train_grpc_address: str | None = None
28
27
 
29
- nuclia_learning_url: Optional[str] = "https://nuclia.cloud/api/v1/learning/"
30
- nuclia_learning_apikey: Optional[str] = None
28
+ nuclia_learning_url: str | None = "https://nuclia.cloud/api/v1/learning/"
29
+ nuclia_learning_apikey: str | None = None
31
30
 
32
31
  internal_counter_api: str = "http://search.nuclia.svc.cluster.local:8030/api/v1/kb/{kbid}/counters"
33
32
 
nucliadb/train/types.py CHANGED
@@ -17,20 +17,20 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import TypeVar, Union
20
+ from typing import TypeVar
21
21
 
22
22
  from nucliadb_protos import dataset_pb2 as dpb
23
23
 
24
- TrainBatch = Union[
25
- dpb.FieldClassificationBatch,
26
- dpb.ImageClassificationBatch,
27
- dpb.ParagraphClassificationBatch,
28
- dpb.ParagraphStreamingBatch,
29
- dpb.QuestionAnswerStreamingBatch,
30
- dpb.SentenceClassificationBatch,
31
- dpb.TokenClassificationBatch,
32
- dpb.FieldStreamingBatch,
33
- ]
24
+ TrainBatch = (
25
+ dpb.FieldClassificationBatch
26
+ | dpb.ImageClassificationBatch
27
+ | dpb.ParagraphClassificationBatch
28
+ | dpb.ParagraphStreamingBatch
29
+ | dpb.QuestionAnswerStreamingBatch
30
+ | dpb.SentenceClassificationBatch
31
+ | dpb.TokenClassificationBatch
32
+ | dpb.FieldStreamingBatch
33
+ )
34
34
 
35
35
  T = TypeVar(
36
36
  "T",
nucliadb/train/upload.py CHANGED
@@ -20,8 +20,9 @@
20
20
  import argparse
21
21
  import asyncio
22
22
  import importlib.metadata
23
+ import inspect
23
24
  from asyncio import tasks
24
- from typing import Callable
25
+ from collections.abc import Callable
25
26
 
26
27
  from nucliadb.train.uploader import start_upload
27
28
  from nucliadb_telemetry import errors
@@ -89,7 +90,7 @@ def run() -> None:
89
90
  finally:
90
91
  try:
91
92
  for finalizer in finalizers:
92
- if asyncio.iscoroutinefunction(finalizer):
93
+ if inspect.iscoroutinefunction(finalizer):
93
94
  loop.run_until_complete(finalizer())
94
95
  else:
95
96
  finalizer()
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  import aiohttp
23
22
 
@@ -97,7 +96,7 @@ class UploadServicer:
97
96
  response.status = GetLabelsResponse.Status.NOTFOUND
98
97
  return response
99
98
  response.kb.uuid = kbid
100
- labels: Optional[Labels] = await datamanagers.atomic.labelset.get_all(kbid=kbid)
99
+ labels: Labels | None = await datamanagers.atomic.labelset.get_all(kbid=kbid)
101
100
  if labels is not None:
102
101
  response.labels.CopyFrom(labels)
103
102
  return response
nucliadb/train/utils.py CHANGED
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  from grpc import aio
23
22
  from grpc_health.v1 import health, health_pb2_grpc
@@ -37,7 +36,7 @@ from nucliadb_utils.utilities import (
37
36
  )
38
37
 
39
38
 
40
- async def start_train_grpc(service_name: Optional[str] = None):
39
+ async def start_train_grpc(service_name: str | None = None):
41
40
  actual_service = get_utility(Utility.TRAIN)
42
41
  if actual_service is not None:
43
42
  return
@@ -17,8 +17,8 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ from collections.abc import AsyncGenerator
20
21
  from datetime import datetime
21
- from typing import AsyncGenerator
22
22
  from uuid import uuid4
23
23
 
24
24
  from fastapi_versioning import version
@@ -66,6 +66,7 @@ from nucliadb_utils.authentication import requires_one
66
66
  summary="Start an export of a Knowledge Box",
67
67
  tags=["Knowledge Boxes"],
68
68
  response_model=CreateExportResponse,
69
+ include_in_schema=False,
69
70
  )
70
71
  @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
71
72
  @version(1)
@@ -91,6 +92,7 @@ async def start_kb_export_endpoint(request: Request, kbid: str):
91
92
  tags=["Knowledge Boxes"],
92
93
  response_model=NewImportedKbResponse,
93
94
  openapi_extra={"x-hidden-operation": True},
95
+ include_in_schema=False,
94
96
  )
95
97
  @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
96
98
  @version(1)
@@ -140,6 +142,7 @@ async def kb_create_and_import_endpoint(request: Request):
140
142
  summary="Start an import to a Knowledge Box",
141
143
  tags=["Knowledge Boxes"],
142
144
  response_model=CreateImportResponse,
145
+ include_in_schema=False,
143
146
  )
144
147
  @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
145
148
  @version(1)
@@ -17,8 +17,9 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ from collections.abc import Callable
20
21
  from inspect import iscoroutinefunction
21
- from typing import TYPE_CHECKING, Annotated, Callable, Optional, Type, Union
22
+ from typing import TYPE_CHECKING, Annotated
22
23
 
23
24
  import pydantic
24
25
  from fastapi import HTTPException, Query, Response
@@ -72,12 +73,7 @@ if TYPE_CHECKING: # pragma: no cover
72
73
  else:
73
74
  FIELD_TYPE_NAME_TO_FIELD_TYPE_MAP: dict[models.FieldTypeName, int]
74
75
 
75
- FieldModelType = Union[
76
- models.TextField,
77
- models.LinkField,
78
- models.InputConversationField,
79
- models.FileField,
80
- ]
76
+ FieldModelType = models.TextField | models.LinkField | models.InputConversationField | models.FileField
81
77
 
82
78
  FIELD_TYPE_NAME_TO_FIELD_TYPE_MAP = {
83
79
  models.FieldTypeName.FILE: resources_pb2.FieldType.FILE,
@@ -249,9 +245,10 @@ async def parse_conversation_field_adapter(
249
245
  writer: BrokerMessage,
250
246
  toprocess: PushPayload,
251
247
  resource_classifications: ResourceClassifications,
248
+ replace_field: bool = False,
252
249
  ):
253
250
  return await parse_conversation_field(
254
- field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications
251
+ field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications, replace_field
255
252
  )
256
253
 
257
254
 
@@ -277,7 +274,7 @@ async def parse_file_field_adapter(
277
274
  )
278
275
 
279
276
 
280
- FIELD_PARSERS_MAP: dict[Type, Callable] = {
277
+ FIELD_PARSERS_MAP: dict[type, Callable] = {
281
278
  models.TextField: parse_text_field_adapter,
282
279
  models.LinkField: parse_link_field_adapter,
283
280
  models.InputConversationField: parse_conversation_field_adapter,
@@ -380,7 +377,9 @@ async def add_resource_field_conversation_rslug_prefix(
380
377
  field_id: FieldIdString,
381
378
  field_payload: models.InputConversationField,
382
379
  ) -> ResourceFieldAdded:
383
- return await add_field_to_resource_by_slug(request, kbid, rslug, field_id, field_payload)
380
+ return await add_field_to_resource_by_slug(
381
+ request, kbid, rslug, field_id, field_payload, replace_field=True
382
+ )
384
383
 
385
384
 
386
385
  @api.put(
@@ -399,7 +398,7 @@ async def add_resource_field_conversation_rid_prefix(
399
398
  field_id: FieldIdString,
400
399
  field_payload: models.InputConversationField,
401
400
  ) -> ResourceFieldAdded:
402
- return await add_field_to_resource(request, kbid, rid, field_id, field_payload)
401
+ return await add_field_to_resource(request, kbid, rid, field_id, field_payload, replace_field=True)
403
402
 
404
403
 
405
404
  @api.put(
@@ -466,7 +465,9 @@ async def append_messages_to_conversation_field_rslug_prefix(
466
465
  field = models.InputConversationField(messages=messages)
467
466
  except pydantic.ValidationError as e:
468
467
  raise HTTPException(status_code=422, detail=str(e))
469
- return await add_field_to_resource_by_slug(request, kbid, rslug, field_id, field)
468
+ return await add_field_to_resource_by_slug(
469
+ request, kbid, rslug, field_id, field, replace_field=False
470
+ )
470
471
 
471
472
 
472
473
  @api.put(
@@ -489,7 +490,7 @@ async def append_messages_to_conversation_field_rid_prefix(
489
490
  field = models.InputConversationField(messages=messages)
490
491
  except pydantic.ValidationError as e:
491
492
  raise HTTPException(status_code=422, detail=str(e))
492
- return await add_field_to_resource(request, kbid, rid, field_id, field)
493
+ return await add_field_to_resource(request, kbid, rid, field_id, field, replace_field=False)
493
494
 
494
495
 
495
496
  @api.delete(
@@ -545,7 +546,7 @@ async def reprocess_file_field(
545
546
  rid: str,
546
547
  field_id: FieldIdString,
547
548
  x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
548
- x_file_password: Annotated[Optional[str], X_FILE_PASSWORD] = None,
549
+ x_file_password: Annotated[str | None, X_FILE_PASSWORD] = None,
549
550
  reset_title: bool = Query(
550
551
  default=False,
551
552
  description="Reset the title of the resource so that the file or link computed titles are set after processing.",
@@ -17,10 +17,9 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- import asyncio
21
20
  from functools import partial
22
21
 
23
- from fastapi import HTTPException
22
+ from fastapi import BackgroundTasks, HTTPException
24
23
  from fastapi_versioning import version
25
24
  from starlette.requests import Request
26
25
 
@@ -36,10 +35,6 @@ from nucliadb.writer import logger
36
35
  from nucliadb.writer.api.utils import only_for_onprem
37
36
  from nucliadb.writer.api.v1.router import KB_PREFIX, KBS_PREFIX, api
38
37
  from nucliadb.writer.utilities import get_processing
39
- from nucliadb_models.external_index_providers import (
40
- ExternalIndexProviderType,
41
- PineconeServerlessCloud,
42
- )
43
38
  from nucliadb_models.resource import (
44
39
  KnowledgeBoxConfig,
45
40
  KnowledgeBoxObj,
@@ -118,20 +113,6 @@ async def create_kb(item: KnowledgeBoxConfig) -> tuple[str, str]:
118
113
  external_index_provider = knowledgebox_pb2.CreateExternalIndexProviderMetadata(
119
114
  type=knowledgebox_pb2.ExternalIndexProviderType.UNSET,
120
115
  )
121
- if (
122
- item.external_index_provider
123
- and item.external_index_provider.type == ExternalIndexProviderType.PINECONE
124
- ):
125
- pinecone_api_key = item.external_index_provider.api_key
126
- serverless_pb = to_pinecone_serverless_cloud_pb(item.external_index_provider.serverless_cloud)
127
- external_index_provider = knowledgebox_pb2.CreateExternalIndexProviderMetadata(
128
- type=knowledgebox_pb2.ExternalIndexProviderType.PINECONE,
129
- pinecone_config=knowledgebox_pb2.CreatePineconeConfig(
130
- api_key=pinecone_api_key,
131
- serverless_cloud=serverless_pb,
132
- ),
133
- )
134
-
135
116
  try:
136
117
  (kbid, slug) = await KnowledgeBox.create(
137
118
  driver,
@@ -165,8 +146,6 @@ async def create_kb(item: KnowledgeBoxConfig) -> tuple[str, str]:
165
146
  @requires(NucliaDBRoles.MANAGER)
166
147
  @version(1)
167
148
  async def update_kb(request: Request, kbid: str, item: KnowledgeBoxConfig) -> KnowledgeBoxObjID:
168
- driver = get_driver()
169
- config = None
170
149
  if (
171
150
  item.slug
172
151
  or item.title
@@ -174,29 +153,24 @@ async def update_kb(request: Request, kbid: str, item: KnowledgeBoxConfig) -> Kn
174
153
  or item.hidden_resources_enabled
175
154
  or item.hidden_resources_hide_on_creation
176
155
  ):
177
- config = knowledgebox_pb2.KnowledgeBoxConfig(
178
- slug=item.slug or "",
179
- title=item.title or "",
180
- description=item.description or "",
181
- hidden_resources_enabled=item.hidden_resources_enabled,
182
- hidden_resources_hide_on_creation=item.hidden_resources_hide_on_creation,
183
- )
184
- try:
185
- async with driver.rw_transaction() as txn:
156
+ try:
157
+ driver = get_driver()
186
158
  await KnowledgeBox.update(
187
- txn,
188
- uuid=kbid,
159
+ driver,
160
+ kbid=kbid,
189
161
  slug=item.slug,
190
- config=config,
162
+ title=item.title,
163
+ description=item.description,
164
+ hidden_resources_enabled=item.hidden_resources_enabled,
165
+ hidden_resources_hide_on_creation=item.hidden_resources_hide_on_creation,
191
166
  )
192
- await txn.commit()
193
- except datamanagers.exceptions.KnowledgeBoxNotFound:
194
- raise HTTPException(status_code=404, detail="Knowledge box does not exist")
195
- except Exception as exc:
196
- logger.exception("Could not update KB", exc_info=exc, extra={"kbid": kbid})
197
- raise HTTPException(status_code=500, detail="Error updating knowledge box")
198
- else:
199
- return KnowledgeBoxObjID(uuid=kbid)
167
+ except datamanagers.exceptions.KnowledgeBoxNotFound:
168
+ raise HTTPException(status_code=404, detail="Knowledge box does not exist")
169
+ except Exception as exc:
170
+ logger.exception("Could not update KB", exc_info=exc, extra={"kbid": kbid})
171
+ raise HTTPException(status_code=500, detail="Error updating knowledge box")
172
+
173
+ return KnowledgeBoxObjID(uuid=kbid)
200
174
 
201
175
 
202
176
  @only_for_onprem
@@ -209,7 +183,7 @@ async def update_kb(request: Request, kbid: str, item: KnowledgeBoxConfig) -> Kn
209
183
  )
210
184
  @requires(NucliaDBRoles.MANAGER)
211
185
  @version(1)
212
- async def delete_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
186
+ async def delete_kb(request: Request, kbid: str, background: BackgroundTasks) -> KnowledgeBoxObj:
213
187
  driver = get_driver()
214
188
  try:
215
189
  await KnowledgeBox.delete(driver, kbid=kbid)
@@ -233,18 +207,6 @@ async def delete_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
233
207
  # be nice and notify processing this KB is being deleted so we waste
234
208
  # resources
235
209
  processing = get_processing()
236
- asyncio.create_task(processing.delete_from_processing(kbid=kbid))
210
+ background.add_task(processing.delete_from_processing, kbid=kbid)
237
211
 
238
212
  return KnowledgeBoxObj(uuid=kbid)
239
-
240
-
241
- def to_pinecone_serverless_cloud_pb(
242
- serverless: PineconeServerlessCloud,
243
- ) -> knowledgebox_pb2.PineconeServerlessCloud.ValueType:
244
- return {
245
- PineconeServerlessCloud.AWS_EU_WEST_1: knowledgebox_pb2.PineconeServerlessCloud.AWS_EU_WEST_1,
246
- PineconeServerlessCloud.AWS_US_EAST_1: knowledgebox_pb2.PineconeServerlessCloud.AWS_US_EAST_1,
247
- PineconeServerlessCloud.AWS_US_WEST_2: knowledgebox_pb2.PineconeServerlessCloud.AWS_US_WEST_2,
248
- PineconeServerlessCloud.AZURE_EASTUS2: knowledgebox_pb2.PineconeServerlessCloud.AZURE_EASTUS2,
249
- PineconeServerlessCloud.GCP_US_CENTRAL1: knowledgebox_pb2.PineconeServerlessCloud.GCP_US_CENTRAL1,
250
- }[serverless]
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from fastapi import Request
20
+ from fastapi import Header, Request
21
21
  from fastapi_versioning import version
22
22
  from nuclia_models.config.proto import ExtractConfig, SplitConfiguration
23
23
 
@@ -55,10 +55,11 @@ async def set_configuration(
55
55
  @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
56
56
  @version(1)
57
57
  async def patch_configuration(
58
- request: Request,
59
- kbid: str,
58
+ request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
60
59
  ):
61
- return await learning_config_proxy(request, "PATCH", f"/config/{kbid}")
60
+ return await learning_config_proxy(
61
+ request, "PATCH", f"/config/{kbid}", headers={"account-id": x_nucliadb_account}
62
+ )
62
63
 
63
64
 
64
65
  @api.post(