nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import functools
21
- from typing import AsyncGenerator, AsyncIterator, Callable, Optional
21
+ from collections.abc import AsyncGenerator, AsyncIterator, Callable
22
22
 
23
23
  import backoff
24
24
  from google.protobuf.message import DecodeError as ProtobufDecodeError
@@ -35,6 +35,7 @@ from nucliadb.export_import.exceptions import (
35
35
  )
36
36
  from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
37
37
  from nucliadb.ingest.orm.broker_message import generate_broker_message
38
+ from nucliadb.ingest.orm.resource import Resource
38
39
  from nucliadb_models.configuration import SearchConfiguration
39
40
  from nucliadb_models.export_import import Status
40
41
  from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -171,14 +172,6 @@ async def import_binary(
171
172
  )
172
173
 
173
174
 
174
- async def set_entities_groups(
175
- context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
176
- ) -> None:
177
- async with datamanagers.with_transaction() as txn:
178
- await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
179
- await txn.commit()
180
-
181
-
182
175
  async def set_synonyms(context: ApplicationContext, kbid: str, synonyms: kb_pb2.Synonyms) -> None:
183
176
  async with datamanagers.with_transaction() as txn:
184
177
  await datamanagers.synonyms.set(txn, kbid=kbid, synonyms=synonyms)
@@ -207,9 +200,9 @@ async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> Asyn
207
200
 
208
201
  async def get_broker_message(
209
202
  context: ApplicationContext, kbid: str, rid: str
210
- ) -> Optional[writer_pb2.BrokerMessage]:
203
+ ) -> writer_pb2.BrokerMessage | None:
211
204
  async with datamanagers.with_ro_transaction() as txn:
212
- resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
205
+ resource = await Resource.get(txn, kbid=kbid, rid=rid)
213
206
  if resource is None:
214
207
  return None
215
208
  resource.disable_vectors = False
@@ -284,11 +277,6 @@ async def download_binary(
284
277
  assert downloaded_bytes == cf.size, "Downloaded bytes do not match the expected size"
285
278
 
286
279
 
287
- async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
288
- async with datamanagers.with_ro_transaction() as txn:
289
- return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
290
-
291
-
292
280
  async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
293
281
  async with datamanagers.with_ro_transaction() as txn:
294
282
  return await datamanagers.labels.get_labels(txn, kbid=kbid)
@@ -434,7 +422,7 @@ class ExportStreamReader:
434
422
 
435
423
  async def maybe_read_learning_config(
436
424
  self,
437
- ) -> tuple[Optional[learning_proxy.LearningConfiguration], bytes]:
425
+ ) -> tuple[learning_proxy.LearningConfiguration | None, bytes]:
438
426
  """
439
427
  Tries to read a learning config from the beginning of the stream.
440
428
  Returs the learning config if found. It also returns any leftover bytes that
@@ -533,7 +521,7 @@ class TaskRetryHandler:
533
521
 
534
522
  async def get_learning_config(
535
523
  kbid: str,
536
- ) -> Optional[learning_proxy.LearningConfiguration]:
524
+ ) -> learning_proxy.LearningConfiguration | None:
537
525
  return await learning_proxy.get_configuration(kbid)
538
526
 
539
527
 
nucliadb/health.py CHANGED
@@ -19,7 +19,7 @@
19
19
  #
20
20
  import asyncio
21
21
  import logging
22
- from typing import Awaitable, Callable, Optional
22
+ from collections.abc import Awaitable, Callable
23
23
 
24
24
  from grpc import aio
25
25
  from grpc_health.v1 import health, health_pb2, health_pb2_grpc
@@ -41,7 +41,7 @@ def nats_manager_healthy() -> bool:
41
41
 
42
42
 
43
43
  def pubsub_check() -> bool:
44
- driver: Optional[PubSubDriver] = get_utility(Utility.PUBSUB)
44
+ driver: PubSubDriver | None = get_utility(Utility.PUBSUB)
45
45
  if driver is None:
46
46
  return True
47
47
  if isinstance(driver, NatsPubsub):
nucliadb/ingest/app.py CHANGED
@@ -19,7 +19,7 @@
19
19
  #
20
20
  import asyncio
21
21
  import importlib.metadata
22
- from typing import Awaitable, Callable
22
+ from collections.abc import Awaitable, Callable
23
23
 
24
24
  from nucliadb import health
25
25
  from nucliadb.backups.tasks import initialize_consumers as initialize_backup_consumers
@@ -96,7 +96,7 @@ async def initialize_grpc(): # pragma: no cover
96
96
  finalizers = await initialize()
97
97
  grpc_finalizer = await start_grpc(SERVICE_NAME)
98
98
 
99
- return [grpc_finalizer] + finalizers
99
+ return [grpc_finalizer, *finalizers]
100
100
 
101
101
 
102
102
  async def initialize_pull_workers() -> list[Callable[[], Awaitable[None]]]:
@@ -114,14 +114,14 @@ async def main_consumer(): # pragma: no cover
114
114
 
115
115
  ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
116
116
 
117
- await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown] + finalizers)
117
+ await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown, *finalizers])
118
118
 
119
119
 
120
120
  async def main_orm_grpc(): # pragma: no cover
121
121
  finalizers = await initialize()
122
122
  grpc_finalizer = await start_grpc(SERVICE_NAME)
123
123
  metrics_server = await serve_metrics()
124
- await run_until_exit([grpc_finalizer, metrics_server.shutdown] + finalizers)
124
+ await run_until_exit([grpc_finalizer, metrics_server.shutdown, *finalizers])
125
125
 
126
126
 
127
127
  async def main_ingest_processed_consumer(): # pragma: no cover
@@ -134,7 +134,7 @@ async def main_ingest_processed_consumer(): # pragma: no cover
134
134
  consumer = await consumer_service.start_ingest_processed_consumer_v2(SERVICE_NAME)
135
135
 
136
136
  await run_until_exit(
137
- [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine] + finalizers
137
+ [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine, *finalizers]
138
138
  )
139
139
 
140
140
 
@@ -158,8 +158,8 @@ async def main_subscriber_workers(): # pragma: no cover
158
158
  backup_consumers_finalizers = await initialize_backup_consumers(context)
159
159
 
160
160
  await run_until_exit(
161
- backup_consumers_finalizers
162
- + [
161
+ [
162
+ *backup_consumers_finalizers,
163
163
  imports_consumer.finalize,
164
164
  exports_consumer.finalize,
165
165
  stop_ingest_utility,
@@ -169,8 +169,8 @@ async def main_subscriber_workers(): # pragma: no cover
169
169
  grpc_health_finalizer,
170
170
  metrics_server.shutdown,
171
171
  context.finalize,
172
+ *finalizers,
172
173
  ]
173
- + finalizers
174
174
  )
175
175
 
176
176
 
@@ -20,12 +20,10 @@
20
20
  import asyncio
21
21
  import logging
22
22
  import time
23
- from typing import Optional, Union
24
23
 
25
24
  import backoff
26
25
  import nats
27
26
  import nats.js.api
28
- import nats.js.errors
29
27
  from nats.aio.client import Msg
30
28
  from nats.js import JetStreamContext
31
29
 
@@ -74,8 +72,8 @@ class IngestConsumer:
74
72
  partition: str,
75
73
  storage: Storage,
76
74
  nats_connection_manager: NatsConnectionManager,
77
- pubsub: Optional[PubSubDriver] = None,
78
- lock: Optional[Union[asyncio.Lock, asyncio.Semaphore]] = None,
75
+ pubsub: PubSubDriver | None = None,
76
+ lock: asyncio.Lock | asyncio.Semaphore | None = None,
79
77
  ):
80
78
  self.driver = driver
81
79
  self.partition = partition
@@ -85,9 +83,9 @@ class IngestConsumer:
85
83
 
86
84
  self.lock = lock or asyncio.Lock()
87
85
  self.processor = Processor(driver, storage, pubsub, partition)
88
- self.subscription: Optional[JetStreamContext.PullSubscription] = None
86
+ self.subscription: JetStreamContext.PullSubscription | None = None
89
87
 
90
- async def ack_message(self, msg: Msg, kbid: Optional[str] = None):
88
+ async def ack_message(self, msg: Msg, kbid: str | None = None):
91
89
  await msg.ack()
92
90
 
93
91
  async def initialize(self):
@@ -162,7 +160,7 @@ class IngestConsumer:
162
160
  async def subscription_worker(self, msg: Msg):
163
161
  context.clear_context()
164
162
 
165
- kbid: Optional[str] = None
163
+ kbid: str | None = None
166
164
  subject = msg.subject
167
165
  reply = msg.reply
168
166
  seqid = int(reply.split(".")[5])
@@ -238,7 +236,7 @@ class IngestConsumer:
238
236
  logger.info(
239
237
  f"An error happend while processing a message from {message_source}. "
240
238
  f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
241
- f"Check sentry for more details: {str(e)}"
239
+ f"Check sentry for more details: {e!s}"
242
240
  )
243
241
  await self.ack_message(msg, kbid)
244
242
  logger.info("Message acked because of deadletter", extra={"seqid": seqid})
@@ -250,7 +248,7 @@ class IngestConsumer:
250
248
  logger.info(
251
249
  f"An error happend while processing a message from {message_source}. "
252
250
  f"This message has been dropped and won't be retried again"
253
- f"Check sentry for more details: {str(e)}"
251
+ f"Check sentry for more details: {e!s}"
254
252
  )
255
253
  await self.ack_message(msg, kbid)
256
254
  logger.info("Message acked because of drop", extra={"seqid": seqid})
@@ -260,7 +258,7 @@ class IngestConsumer:
260
258
  logger.exception(
261
259
  f"An error happend while processing a message from {message_source}. "
262
260
  "Message has not been ACKd and will be retried. "
263
- f"Check sentry for more details: {str(e)}"
261
+ f"Check sentry for more details: {e!s}"
264
262
  )
265
263
  await msg.nak()
266
264
  logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})
@@ -21,7 +21,6 @@ import asyncio
21
21
  import base64
22
22
  import time
23
23
  from contextlib import contextmanager
24
- from typing import Optional
25
24
 
26
25
  from aiohttp.client_exceptions import ClientConnectorError
27
26
  from opentelemetry import trace
@@ -31,10 +30,10 @@ from opentelemetry.trace import (
31
30
  Link,
32
31
  )
33
32
 
33
+ from nucliadb.common.http_clients.exceptions import ServiceUnavailableException
34
34
  from nucliadb.common.http_clients.processing import (
35
35
  ProcessingHTTPClient,
36
36
  ProcessingPullMessageProgressUpdater,
37
- get_nua_api_id,
38
37
  )
39
38
  from nucliadb.common.maindb.driver import Driver
40
39
  from nucliadb.ingest import SERVICE_NAME, logger, logger_activity
@@ -95,7 +94,7 @@ class PullV2Worker:
95
94
  driver: Driver,
96
95
  storage: Storage,
97
96
  pull_time_error_backoff: int,
98
- pubsub: Optional[PubSubDriver] = None,
97
+ pubsub: PubSubDriver | None = None,
99
98
  pull_time_empty_backoff: float = 5.0,
100
99
  pull_api_timeout: int = 60,
101
100
  ):
@@ -141,12 +140,9 @@ class PullV2Worker:
141
140
  data = None
142
141
  if nuclia_settings.nuclia_service_account is not None:
143
142
  headers["X-STF-NUAKEY"] = f"Bearer {nuclia_settings.nuclia_service_account}"
144
- # parse jwt sub to get pull type id
145
- try:
146
- get_nua_api_id()
147
- except Exception as exc:
143
+ if nuclia_settings.nuclia_service_account is None:
148
144
  logger.exception("Could not read NUA API Key. Can not start pull worker")
149
- raise ReallyStopPulling() from exc
145
+ raise ReallyStopPulling()
150
146
 
151
147
  ack_tokens = []
152
148
  async with ProcessingHTTPClient() as processing_http_client:
@@ -209,6 +205,12 @@ class PullV2Worker:
209
205
  payload_length = len(base64.b64decode(data.payload))
210
206
  logger.error(f"Message too big for transaction: {payload_length}")
211
207
  raise e
208
+
209
+ except ServiceUnavailableException as ex:
210
+ logger.warning(f"Processing api is unavailable, will retry shortly: {ex}")
211
+ await processing_http_client.reset_session()
212
+ await asyncio.sleep(self.pull_time_error_backoff)
213
+
212
214
  except Exception:
213
215
  logger.exception("Unhandled error pulling messages from processing")
214
216
  await asyncio.sleep(self.pull_time_error_backoff)
@@ -19,24 +19,21 @@
19
19
  #
20
20
  import asyncio
21
21
  import sys
22
+ from collections.abc import Awaitable, Callable
22
23
  from functools import partial
23
- from typing import Awaitable, Callable, Optional
24
24
 
25
- from nucliadb.common.back_pressure.materializer import BackPressureMaterializer
26
- from nucliadb.common.back_pressure.settings import settings as back_pressure_settings
27
25
  from nucliadb.common.maindb.utils import setup_driver
28
26
  from nucliadb.ingest import SERVICE_NAME, logger
29
27
  from nucliadb.ingest.consumer.consumer import IngestConsumer
30
28
  from nucliadb.ingest.consumer.pull import PullV2Worker
31
29
  from nucliadb.ingest.settings import settings
32
30
  from nucliadb_utils.exceptions import ConfigurationError
33
- from nucliadb_utils.settings import indexing_settings, transaction_settings
31
+ from nucliadb_utils.settings import transaction_settings
34
32
  from nucliadb_utils.utilities import (
35
33
  get_audit,
36
34
  get_nats_manager,
37
35
  get_pubsub,
38
36
  get_storage,
39
- start_nats_manager,
40
37
  )
41
38
 
42
39
  from .auditing import IndexAuditHandler, ResourceWritesAuditHandler
@@ -57,29 +54,8 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
57
54
  await asyncio.gather(*tasks, return_exceptions=True)
58
55
 
59
56
 
60
- async def start_back_pressure() -> BackPressureMaterializer:
61
- logger.info("Starting back pressure materializer")
62
- nats_manager = await start_nats_manager(
63
- SERVICE_NAME,
64
- indexing_settings.index_jetstream_servers,
65
- indexing_settings.index_jetstream_auth,
66
- )
67
- back_pressure = BackPressureMaterializer(
68
- nats_manager,
69
- indexing_check_interval=back_pressure_settings.indexing_check_interval,
70
- ingest_check_interval=back_pressure_settings.ingest_check_interval,
71
- )
72
- await back_pressure.start()
73
- return back_pressure
74
-
75
-
76
- async def stop_back_pressure(materializer: BackPressureMaterializer) -> None:
77
- await materializer.stop()
78
- await materializer.nats_manager.finalize()
79
-
80
-
81
57
  async def start_ingest_consumers(
82
- service_name: Optional[str] = None,
58
+ service_name: str | None = None,
83
59
  ) -> Callable[[], Awaitable[None]]:
84
60
  if transaction_settings.transaction_local:
85
61
  raise ConfigurationError("Can not start ingest consumers in local mode")
@@ -115,7 +91,7 @@ async def start_ingest_consumers(
115
91
 
116
92
 
117
93
  async def start_ingest_processed_consumer_v2(
118
- service_name: Optional[str] = None,
94
+ service_name: str | None = None,
119
95
  ) -> Callable[[], Awaitable[None]]:
120
96
  """
121
97
  This is not meant to be deployed with a stateful set like the other consumers.
@@ -164,9 +140,8 @@ async def start_shard_creator() -> Callable[[], Awaitable[None]]:
164
140
  driver = await setup_driver()
165
141
  pubsub = await get_pubsub()
166
142
  assert pubsub is not None, "Pubsub is not configured"
167
- storage = await get_storage(service_name=SERVICE_NAME)
168
143
 
169
- shard_creator = ShardCreatorHandler(driver=driver, storage=storage, pubsub=pubsub)
144
+ shard_creator = ShardCreatorHandler(driver=driver, pubsub=pubsub)
170
145
  await shard_creator.initialize()
171
146
 
172
147
  return shard_creator.finalize
@@ -25,14 +25,14 @@ from typing import Any
25
25
 
26
26
  from nidx_protos import nodereader_pb2, noderesources_pb2
27
27
 
28
- from nucliadb.common import locking
28
+ from nucliadb.common import datamanagers, locking
29
+ from nucliadb.common.cluster.settings import settings
29
30
  from nucliadb.common.cluster.utils import get_shard_manager
30
31
  from nucliadb.common.maindb.driver import Driver
31
32
  from nucliadb.common.nidx import get_nidx_api_client
32
33
  from nucliadb_protos import writer_pb2
33
34
  from nucliadb_utils import const
34
35
  from nucliadb_utils.cache.pubsub import PubSubDriver
35
- from nucliadb_utils.storages.storage import Storage
36
36
 
37
37
  from . import metrics
38
38
  from .utils import DelayedTaskHandler
@@ -52,12 +52,10 @@ class ShardCreatorHandler:
52
52
  self,
53
53
  *,
54
54
  driver: Driver,
55
- storage: Storage,
56
55
  pubsub: PubSubDriver,
57
56
  check_delay: float = 10.0,
58
57
  ):
59
58
  self.driver = driver
60
- self.storage = storage
61
59
  self.pubsub = pubsub
62
60
  self.shard_manager = get_shard_manager()
63
61
  self.task_handler = DelayedTaskHandler(check_delay)
@@ -111,4 +109,17 @@ class ShardCreatorHandler:
111
109
  shard_id=noderesources_pb2.ShardId(id=current_shard.nidx_shard_id)
112
110
  ) # type: ignore
113
111
  )
114
- await self.shard_manager.maybe_create_new_shard(kbid, shard.paragraphs)
112
+
113
+ if not should_create_new_shard(shard.paragraphs):
114
+ return
115
+
116
+ logger.info({"message": "Adding shard", "kbid": kbid})
117
+ async with datamanagers.with_rw_transaction() as txn:
118
+ kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
119
+ prewarm = kb_config is not None and kb_config.prewarm_enabled
120
+ await self.shard_manager.create_shard_by_kbid(txn, kbid, prewarm_enabled=prewarm)
121
+ await txn.commit()
122
+
123
+
124
+ def should_create_new_shard(num_paragraphs: int) -> bool:
125
+ return num_paragraphs > settings.max_shard_paragraphs
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
- from typing import Callable, Coroutine
21
+ from collections.abc import Callable, Coroutine
22
22
 
23
23
 
24
24
  class DelayedTaskHandler:
@@ -24,11 +24,12 @@ import enum
24
24
  import logging
25
25
  from collections import defaultdict
26
26
  from datetime import datetime
27
- from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
27
+ from typing import TYPE_CHECKING, Any, Generic, TypeVar
28
28
 
29
29
  from google.protobuf.message import DecodeError, Message
30
30
 
31
31
  from nucliadb.common import datamanagers
32
+ from nucliadb.common.ids import FieldId
32
33
  from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
33
34
  from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
34
35
  from nucliadb_protos.resources_pb2 import (
@@ -46,10 +47,8 @@ from nucliadb_protos.resources_pb2 import (
46
47
  )
47
48
  from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
48
49
  from nucliadb_protos.writer_pb2 import Error, FieldStatus
49
- from nucliadb_utils import const
50
50
  from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
51
51
  from nucliadb_utils.storages.storage import Storage, StorageField
52
- from nucliadb_utils.utilities import has_feature
53
52
 
54
53
  logger = logging.getLogger(__name__)
55
54
 
@@ -76,27 +75,27 @@ PbType = TypeVar("PbType", bound=Message)
76
75
 
77
76
 
78
77
  class Field(Generic[PbType]):
79
- pbklass: Type[PbType]
78
+ pbklass: type[PbType]
80
79
  type: str = "x"
81
- value: Optional[Any]
82
- extracted_text: Optional[ExtractedText]
83
- extracted_vectors: dict[Optional[str], VectorObject]
84
- computed_metadata: Optional[FieldComputedMetadata]
85
- large_computed_metadata: Optional[LargeComputedMetadata]
86
- question_answers: Optional[FieldQuestionAnswers]
80
+ value: Any | None
81
+ extracted_text: ExtractedText | None
82
+ extracted_vectors: dict[str | None, VectorObject]
83
+ computed_metadata: FieldComputedMetadata | None
84
+ large_computed_metadata: LargeComputedMetadata | None
85
+ question_answers: FieldQuestionAnswers | None
87
86
 
88
87
  def __init__(
89
88
  self,
90
89
  id: str,
91
90
  resource: Resource,
92
- pb: Optional[Any] = None,
93
- value: Optional[Any] = None,
91
+ pb: Any | None = None,
92
+ value: Any | None = None,
94
93
  ):
95
94
  if self.pbklass is None:
96
95
  raise InvalidFieldClass()
97
96
 
98
97
  self.value = None
99
- self.extracted_text: Optional[ExtractedText] = None
98
+ self.extracted_text: ExtractedText | None = None
100
99
  self.extracted_vectors = {}
101
100
  self.computed_metadata = None
102
101
  self.large_computed_metadata = None
@@ -119,12 +118,20 @@ class Field(Generic[PbType]):
119
118
 
120
119
  @property
121
120
  def kbid(self) -> str:
122
- return self.resource.kb.kbid
121
+ return self.resource.kbid
123
122
 
124
123
  @property
125
124
  def uuid(self) -> str:
126
125
  return self.resource.uuid
127
126
 
127
+ @property
128
+ def field_id(self) -> FieldId:
129
+ return FieldId(
130
+ rid=self.resource.uuid,
131
+ type=self.type,
132
+ key=self.id,
133
+ )
134
+
128
135
  @property
129
136
  def storage(self) -> Storage:
130
137
  return self.resource.storage
@@ -152,7 +159,7 @@ class Field(Generic[PbType]):
152
159
 
153
160
  return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)
154
161
 
155
- async def db_get_value(self) -> Optional[PbType]:
162
+ async def db_get_value(self) -> PbType | None:
156
163
  if self.value is None:
157
164
  payload = await datamanagers.fields.get_raw(
158
165
  self.resource.txn,
@@ -215,21 +222,6 @@ class Field(Generic[PbType]):
215
222
  ) -> None:
216
223
  # Try delete vectors
217
224
  sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
218
-
219
- if has_feature(const.Features.DEBUG_MISSING_VECTORS):
220
- # This is a very chatty log. It is just a temporary hint while debugging an issue.
221
- logger.info(
222
- "Deleting vectors from storage",
223
- extra={
224
- "kbid": self.kbid,
225
- "rid": self.resource.uuid,
226
- "field": f"{self.type}/{self.id}",
227
- "vectorset": vectorset,
228
- "storage_key_kind": storage_key_kind,
229
- "key": sf.key,
230
- "bucket": sf.bucket,
231
- },
232
- )
233
225
  try:
234
226
  await self.storage.delete_upload(sf.key, sf.bucket)
235
227
  except KeyError:
@@ -242,7 +234,7 @@ class Field(Generic[PbType]):
242
234
  except KeyError:
243
235
  pass
244
236
 
245
- async def get_error(self) -> Optional[Error]:
237
+ async def get_error(self) -> Error | None:
246
238
  return await datamanagers.fields.get_error(
247
239
  self.resource.txn,
248
240
  kbid=self.kbid,
@@ -261,7 +253,7 @@ class Field(Generic[PbType]):
261
253
  error=error,
262
254
  )
263
255
 
264
- async def get_status(self) -> Optional[FieldStatus]:
256
+ async def get_status(self) -> FieldStatus | None:
265
257
  return await datamanagers.fields.get_status(
266
258
  self.resource.txn,
267
259
  kbid=self.kbid,
@@ -280,7 +272,7 @@ class Field(Generic[PbType]):
280
272
  status=status,
281
273
  )
282
274
 
283
- async def get_question_answers(self, force=False) -> Optional[FieldQuestionAnswers]:
275
+ async def get_question_answers(self, force=False) -> FieldQuestionAnswers | None:
284
276
  if self.question_answers is None or force:
285
277
  sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
286
278
  try:
@@ -297,9 +289,7 @@ class Field(Generic[PbType]):
297
289
  async def set_question_answers(self, payload: FieldQuestionAnswerWrapper) -> None:
298
290
  if self.type in SUBFIELDFIELDS:
299
291
  try:
300
- actual_payload: Optional[FieldQuestionAnswers] = await self.get_question_answers(
301
- force=True
302
- )
292
+ actual_payload: FieldQuestionAnswers | None = await self.get_question_answers(force=True)
303
293
  except KeyError:
304
294
  actual_payload = None
305
295
  else:
@@ -332,7 +322,7 @@ class Field(Generic[PbType]):
332
322
  self.question_answers = actual_payload
333
323
 
334
324
  async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
335
- actual_payload: Optional[ExtractedText] = None
325
+ actual_payload: ExtractedText | None = None
336
326
  if self.type in SUBFIELDFIELDS:
337
327
  # Try to get the previously extracted text protobuf if it exists so we can merge it with the new splits
338
328
  # coming from the processing payload.
@@ -383,7 +373,7 @@ class Field(Generic[PbType]):
383
373
  await self.storage.upload_pb(sf, actual_payload)
384
374
  self.extracted_text = actual_payload
385
375
 
386
- async def get_extracted_text(self, force=False) -> Optional[ExtractedText]:
376
+ async def get_extracted_text(self, force=False) -> ExtractedText | None:
387
377
  if self.extracted_text is None or force:
388
378
  async with self.locks["extracted_text"]:
389
379
  # Value could have been fetched while waiting for the lock
@@ -399,10 +389,10 @@ class Field(Generic[PbType]):
399
389
  payload: ExtractedVectorsWrapper,
400
390
  vectorset: str,
401
391
  storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
402
- ) -> Optional[VectorObject]:
392
+ ) -> VectorObject | None:
403
393
  if self.type in SUBFIELDFIELDS:
404
394
  try:
405
- actual_payload: Optional[VectorObject] = await self.get_vectors(
395
+ actual_payload: VectorObject | None = await self.get_vectors(
406
396
  vectorset=vectorset,
407
397
  storage_key_kind=storage_key_kind,
408
398
  force=True,
@@ -413,7 +403,7 @@ class Field(Generic[PbType]):
413
403
  actual_payload = None
414
404
 
415
405
  sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
416
- vo: Optional[VectorObject] = None
406
+ vo: VectorObject | None = None
417
407
  if actual_payload is None:
418
408
  # Its first extracted vectors
419
409
  if payload.HasField("file"):
@@ -465,7 +455,7 @@ class Field(Generic[PbType]):
465
455
  vectorset: str,
466
456
  storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
467
457
  force: bool = False,
468
- ) -> Optional[VectorObject]:
458
+ ) -> VectorObject | None:
469
459
  if self.extracted_vectors.get(vectorset, None) is None or force:
470
460
  sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
471
461
  payload = await self.storage.download_pb(sf, VectorObject)
@@ -476,9 +466,7 @@ class Field(Generic[PbType]):
476
466
  async def set_field_metadata(self, payload: FieldComputedMetadataWrapper) -> FieldComputedMetadata:
477
467
  if self.type in SUBFIELDFIELDS:
478
468
  try:
479
- actual_payload: Optional[FieldComputedMetadata] = await self.get_field_metadata(
480
- force=True
481
- )
469
+ actual_payload: FieldComputedMetadata | None = await self.get_field_metadata(force=True)
482
470
  except KeyError:
483
471
  actual_payload = None
484
472
  else:
@@ -521,7 +509,7 @@ class Field(Generic[PbType]):
521
509
 
522
510
  return self.computed_metadata
523
511
 
524
- async def get_field_metadata(self, force: bool = False) -> Optional[FieldComputedMetadata]:
512
+ async def get_field_metadata(self, force: bool = False) -> FieldComputedMetadata | None:
525
513
  if self.computed_metadata is None or force:
526
514
  async with self.locks["field_metadata"]:
527
515
  # Value could have been fetched while waiting for the lock
@@ -535,7 +523,7 @@ class Field(Generic[PbType]):
535
523
  async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
536
524
  if self.type in SUBFIELDFIELDS:
537
525
  try:
538
- actual_payload: Optional[LargeComputedMetadata] = await self.get_large_field_metadata(
526
+ actual_payload: LargeComputedMetadata | None = await self.get_large_field_metadata(
539
527
  force=True
540
528
  )
541
529
  except KeyError:
@@ -545,7 +533,7 @@ class Field(Generic[PbType]):
545
533
 
546
534
  sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
547
535
 
548
- new_payload: Optional[LargeComputedMetadata] = None
536
+ new_payload: LargeComputedMetadata | None = None
549
537
  if payload.HasField("file"):
550
538
  new_payload = LargeComputedMetadata()
551
539
  data = await self.storage.downloadbytescf(payload.file)
@@ -572,7 +560,7 @@ class Field(Generic[PbType]):
572
560
 
573
561
  return self.large_computed_metadata
574
562
 
575
- async def get_large_field_metadata(self, force: bool = False) -> Optional[LargeComputedMetadata]:
563
+ async def get_large_field_metadata(self, force: bool = False) -> LargeComputedMetadata | None:
576
564
  if self.large_computed_metadata is None or force:
577
565
  sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
578
566
  payload = await self.storage.download_pb(