nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff compares two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
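
Before the per-file hunks below: the change that shows up in almost every touched module is a typing cleanup. `Optional[X]` becomes the PEP 604 union `X | None`, `typing.Type` becomes the builtin `type`, and ABCs such as `Sequence` or `AsyncGenerator` are imported from `collections.abc` instead of `typing`. A minimal before/after sketch of the pattern (illustrative only; the `fetch_title_*` functions are made up, not nucliadb code):

# Old style (still valid, but uses the deprecated typing aliases):
from typing import AsyncGenerator, Optional


async def fetch_title_old(rid: str) -> Optional[str]:
    ...


# New style used across 6.10: collections.abc imports plus PEP 604 unions.
# Note: `X | None` in annotations needs Python 3.10+ (or
# `from __future__ import annotations` on older interpreters).
from collections.abc import AsyncGenerator  # noqa: F811  (redefinition is fine in this sketch)


async def fetch_title_new(rid: str) -> str | None:
    ...
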
--- a/nucliadb/ingest/orm/knowledgebox.py
+++ b/nucliadb/ingest/orm/knowledgebox.py
@@ -17,9 +17,10 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+from collections.abc import AsyncGenerator, Callable, Coroutine, Sequence
 from datetime import datetime
 from functools import partial
-from typing import Any, AsyncGenerator, Callable, Coroutine, Optional, Sequence
+from typing import Any
 from uuid import uuid4

 from grpc import StatusCode
@@ -88,7 +89,7 @@ class KnowledgeBox:
         self.txn = txn
         self.storage = storage
         self.kbid = kbid
-        self._config: Optional[KnowledgeBoxConfig] = None
+        self._config: KnowledgeBoxConfig | None = None

     @staticmethod
     def new_unique_kbid() -> str:
@@ -248,14 +249,14 @@ class KnowledgeBox:
         driver: Driver,
         kbid: str,
         *,
-        slug: Optional[str] = None,
-        title: Optional[str] = None,
-        description: Optional[str] = None,
-        migration_version: Optional[int] = None,
-        external_index_provider: Optional[StoredExternalIndexProviderMetadata] = None,
-        hidden_resources_enabled: Optional[bool] = None,
-        hidden_resources_hide_on_creation: Optional[bool] = None,
-        prewarm_enabled: Optional[bool] = None,
+        slug: str | None = None,
+        title: str | None = None,
+        description: str | None = None,
+        migration_version: int | None = None,
+        external_index_provider: StoredExternalIndexProviderMetadata | None = None,
+        hidden_resources_enabled: bool | None = None,
+        hidden_resources_hide_on_creation: bool | None = None,
+        prewarm_enabled: bool | None = None,
     ) -> str:
         async with driver.rw_transaction() as txn:
             stored = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
@@ -428,7 +429,7 @@ class KnowledgeBox:
             await txn.delete_by_prefix(prefix)
             await txn.commit()

-    async def get_resource_shard(self, shard_id: str) -> Optional[writer_pb2.ShardObject]:
+    async def get_resource_shard(self, shard_id: str) -> writer_pb2.ShardObject | None:
         async with datamanagers.with_ro_transaction() as txn:
             pb = await datamanagers.cluster.get_kb_shards(txn, kbid=self.kbid)
             if pb is None:
@@ -439,18 +440,8 @@ class KnowledgeBox:
                 return shard
         return None

-    async def get(self, uuid: str) -> Optional[Resource]:
-        basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
-        if basic is None:
-            return None
-        return Resource(
-            txn=self.txn,
-            storage=self.storage,
-            kb=self,
-            uuid=uuid,
-            basic=basic,
-            disable_vectors=False,
-        )
+    async def get(self, uuid: str) -> Resource | None:
+        return await Resource.get(self.txn, self.kbid, uuid)

     async def maindb_delete_resource(self, uuid: str):
         basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
@@ -479,7 +470,7 @@ class KnowledgeBox:
         with processor_observer({"type": "delete_resource_storage"}):
             await self.storage_delete_resource(uuid)

-    async def get_resource_uuid_by_slug(self, slug: str) -> Optional[str]:
+    async def get_resource_uuid_by_slug(self, slug: str) -> str | None:
         return await datamanagers.resources.get_resource_uuid_from_slug(
             self.txn, kbid=self.kbid, slug=slug
         )
@@ -496,7 +487,7 @@ class KnowledgeBox:
                 key_ok = True
         return slug

-    async def add_resource(self, uuid: str, slug: str, basic: Optional[Basic] = None) -> Resource:
+    async def add_resource(self, uuid: str, slug: str, basic: Basic | None = None) -> Resource:
         if basic is None:
             basic = Basic()
         if slug == "":
@@ -508,7 +499,7 @@ class KnowledgeBox:
         return Resource(
             storage=self.storage,
             txn=self.txn,
-            kb=self,
+            kbid=self.kbid,
             uuid=uuid,
             basic=basic,
             disable_vectors=False,
@@ -523,7 +514,7 @@ class KnowledgeBox:
             yield Resource(
                 self.txn,
                 self.storage,
-                self,
+                self.kbid,
                 uuid,
                 disable_vectors=False,
             )
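
Note on the hunks above: `Resource` no longer keeps a reference to its parent `KnowledgeBox`. Every construction site now passes the plain `kbid` string (`kbid=self.kbid` instead of `kb=self`), and `KnowledgeBox.get()` becomes a thin wrapper over the new `Resource.get()` classmethod. The same decoupling idea, reduced to a runnable toy (these classes are illustrative stand-ins, not nucliadb code):

from dataclasses import dataclass


@dataclass
class Workspace:  # stands in for KnowledgeBox
    wsid: str


@dataclass
class Document:   # stands in for Resource
    wsid: str     # keeps only the parent's id, not the parent object
    uuid: str


ws = Workspace(wsid="kb-123")
doc = Document(wsid=ws.wsid, uuid="doc-1")  # no Workspace instance required
assert doc.wsid == "kb-123"

Carrying the id instead of the object removes the dependency of `Resource` on `KnowledgeBox` (see the deleted `TYPE_CHECKING` import in resource.py further down) and lets a `Resource` be built from any code path that only knows the knowledge box id.
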

--- a/nucliadb/ingest/orm/processor/auditing.py
+++ b/nucliadb/ingest/orm/processor/auditing.py
@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 from nucliadb.common.maindb.driver import Driver
-from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb_protos import audit_pb2, writer_pb2
 from nucliadb_protos.resources_pb2 import FieldType
@@ -35,8 +34,7 @@ async def collect_audit_fields(

     audit_storage_fields: list[audit_pb2.AuditField] = []
     async with driver.ro_transaction() as txn:
-        kb = KnowledgeBox(txn, storage, message.kbid)
-        resource = Resource(txn, storage, kb, message.uuid)
+        resource = Resource(txn, storage, message.kbid, message.uuid)
         field_keys = await resource.get_fields_ids()

         for field_id, field_type in iterate_auditable_fields(field_keys, message):

--- a/nucliadb/ingest/orm/processor/data_augmentation.py
+++ b/nucliadb/ingest/orm/processor/data_augmentation.py
@@ -20,7 +20,6 @@

 import logging
 from dataclasses import dataclass, field
-from typing import Optional

 from nucliadb.ingest.orm.resource import Resource
 from nucliadb.ingest.processing import ProcessingEngine
@@ -94,7 +93,7 @@ def _generate_processing_payload_for_fields(
     rid: str,
     fields: GeneratedFields,
    bm: writer_pb2.BrokerMessage,
-) -> Optional[PushPayload]:
+) -> PushPayload | None:
     partitioning = get_partitioning()
     partition = partitioning.generate_partition(kbid, rid)


--- a/nucliadb/ingest/orm/processor/processor.py
+++ b/nucliadb/ingest/orm/processor/processor.py
@@ -19,7 +19,6 @@
 #
 import asyncio
 import logging
-from typing import Optional

 import aiohttp.client_exceptions
 import nats.errors
@@ -145,8 +144,8 @@ class Processor:
         self,
         driver: Driver,
         storage: Storage,
-        pubsub: Optional[PubSubDriver] = None,
-        partition: Optional[str] = None,
+        pubsub: PubSubDriver | None = None,
+        partition: str | None = None,
     ):
         self.driver = driver
         self.storage = storage
@@ -158,7 +157,7 @@ class Processor:
         self,
         message: writer_pb2.BrokerMessage,
         seqid: int,
-        partition: Optional[str] = None,
+        partition: str | None = None,
         transaction_check: bool = True,
     ) -> None:
         partition = partition if self.partition is None else self.partition
@@ -285,7 +284,7 @@ class Processor:
         kb = KnowledgeBox(txn, self.storage, kbid)
         uuid = await self.get_resource_uuid(kb, message)

-        resource: Optional[Resource] = None
+        resource: Resource | None = None
         handled_exception = None
         created = False

@@ -446,26 +445,27 @@ class Processor:
         # a resource was move to another shard while it was being indexed
         shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=uuid)

-        shard = None
-        if shard_id is not None:
-            # Resource already has a shard assigned
-            shard = await kb.get_resource_shard(shard_id)
-            if shard is None:
-                raise AttributeError("Shard not available")
-        else:
-            # It's a new resource, get KB's current active shard to place new resource on
-            shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
-            if shard is None:
-                # No current shard available, create a new one
-                kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
-                prewarm = kb_config is not None and kb_config.prewarm_enabled
-                shard = await self.index_node_shard_manager.create_shard_by_kbid(
-                    txn, kbid, prewarm_enabled=prewarm
-                )
-            await datamanagers.resources.set_resource_shard_id(
-                txn, kbid=kbid, rid=uuid, shard=shard.shard
-            )
-        return shard
+        shard = None
+        if shard_id is not None:
+            # Resource already has a shard assigned
+            shard = await kb.get_resource_shard(shard_id)
+            if shard is None:
+                raise AttributeError("Shard not available")
+        else:
+            # It's a new resource, get KB's current active shard to place new resource on
+            shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
+            if shard is None:
+                # No current shard available, create a new one
+                async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
+                    kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
+                    prewarm = kb_config is not None and kb_config.prewarm_enabled
+                    shard = await self.index_node_shard_manager.create_shard_by_kbid(
+                        txn, kbid, prewarm_enabled=prewarm
+                    )
+            await datamanagers.resources.set_resource_shard_id(
+                txn, kbid=kbid, rid=uuid, shard=shard.shard
+            )
+        return shard

     @processor_observer.wrap({"type": "index_resource"})
     async def index_resource(
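
The hunk above is the one behavioural change in this file: when a new resource arrives and the knowledge box has no active shard yet, shard creation is now wrapped in `locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid))`, so concurrent consumers ingesting into the same KB serialize on shard creation instead of racing to create several shards. A toy sketch of the general "lock around create-if-missing" pattern follows (assumptions: `asyncio.Lock` stands in for the real distributed lock, which coordinates across processes, and the re-check after acquiring the lock is the usual companion to the pattern, not something shown in the hunk):

import asyncio
from collections import defaultdict

_locks: defaultdict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
_active_shard: dict[str, str] = {}


async def get_or_create_shard(kbid: str) -> str:
    shard = _active_shard.get(kbid)
    if shard is None:
        async with _locks[kbid]:                # serialize creation per KB
            shard = _active_shard.get(kbid)     # re-check once the lock is held
            if shard is None:
                shard = f"{kbid}/shard-0"       # pretend we created one
                _active_shard[kbid] = shard
    return shard


async def main() -> None:
    shards = await asyncio.gather(*(get_or_create_shard("kb-123") for _ in range(5)))
    assert len(set(shards)) == 1                # every caller got the same shard


asyncio.run(main())
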
@@ -674,7 +674,7 @@ class Processor:
             await self.pubsub.publish(channel, payload)

     async def _mark_resource_error(
-        self, kb: KnowledgeBox, resource: Optional[Resource], partition: str, seqid: int
+        self, kb: KnowledgeBox, resource: Resource | None, partition: str, seqid: int
     ) -> None:
         """
         Unhandled error processing, try to mark resource as error
@@ -695,8 +695,8 @@ class Processor:
     # XXX: Why are these utility functions here?
     async def get_kb_obj(
         self, txn: Transaction, kbid: knowledgebox_pb2.KnowledgeBoxID
-    ) -> Optional[KnowledgeBox]:
-        uuid: Optional[str] = kbid.uuid
+    ) -> KnowledgeBox | None:
+        uuid: str | None = kbid.uuid
         if uuid == "":
             uuid = await datamanagers.kb.get_kb_uuid(txn, slug=kbid.slug)


--- a/nucliadb/ingest/orm/processor/sequence_manager.py
+++ b/nucliadb/ingest/orm/processor/sequence_manager.py
@@ -17,14 +17,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Optional

 from nucliadb.common.maindb.driver import Driver, Transaction

 TXNID = "/internal/worker/{worker}"


-async def get_last_seqid(driver: Driver, worker: str) -> Optional[int]:
+async def get_last_seqid(driver: Driver, worker: str) -> int | None:
     """
     Get last stored sequence id for a worker.


--- a/nucliadb/ingest/orm/resource.py
+++ b/nucliadb/ingest/orm/resource.py
@@ -22,8 +22,9 @@ from __future__ import annotations
 import asyncio
 import logging
 from collections import defaultdict
+from collections.abc import Sequence
 from concurrent.futures import ThreadPoolExecutor
-from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
+from typing import Any

 from nucliadb.common import datamanagers
 from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
@@ -68,13 +69,11 @@ from nucliadb_protos.resources_pb2 import Origin as PBOrigin
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
 from nucliadb_protos.writer_pb2 import BrokerMessage
 from nucliadb_utils.storages.storage import Storage
-
-if TYPE_CHECKING:  # pragma: no cover
-    from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
+from nucliadb_utils.utilities import get_storage

 logger = logging.getLogger(__name__)

-KB_FIELDS: dict[int, Type] = {
+KB_FIELDS: dict[int, type] = {
     FieldType.TEXT: Text,
     FieldType.FILE: File,
     FieldType.LINK: Link,
@@ -104,40 +103,55 @@ class Resource:
         self,
         txn: Transaction,
         storage: Storage,
-        kb: KnowledgeBox,
+        kbid: str,
         uuid: str,
-        basic: Optional[PBBasic] = None,
+        basic: PBBasic | None = None,
         disable_vectors: bool = True,
     ):
         self.fields: dict[tuple[FieldType.ValueType, str], Field] = {}
         self.conversations: dict[int, PBConversation] = {}
-        self.relations: Optional[PBRelations] = None
-        self.all_fields_keys: Optional[list[tuple[FieldType.ValueType, str]]] = None
-        self.origin: Optional[PBOrigin] = None
-        self.extra: Optional[PBExtra] = None
-        self.security: Optional[utils_pb2.Security] = None
+        self.relations: PBRelations | None = None
+        self.all_fields_keys: list[tuple[FieldType.ValueType, str]] | None = None
+        self.origin: PBOrigin | None = None
+        self.extra: PBExtra | None = None
+        self.security: utils_pb2.Security | None = None
         self.modified: bool = False
         self._modified_extracted_text: list[FieldID] = []

         self.txn = txn
         self.storage = storage
-        self.kb = kb
+        self.kbid = kbid
         self.uuid = uuid
         self.basic = basic
         self.disable_vectors = disable_vectors
-        self._previous_status: Optional[Metadata.Status.ValueType] = None
-        self.user_relations: Optional[PBRelations] = None
+        self._previous_status: Metadata.Status.ValueType | None = None
+        self.user_relations: PBRelations | None = None
         self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)

+    @classmethod
+    async def get(cls, txn: Transaction, kbid: str, rid: str) -> Resource | None:
+        basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
+        if basic is None:
+            return None
+        storage = await get_storage()
+        return cls(
+            txn=txn,
+            storage=storage,
+            kbid=kbid,
+            uuid=rid,
+            basic=basic,
+            disable_vectors=False,
+        )
+
     async def set_slug(self):
         basic = await self.get_basic()
-        new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
+        new_key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug)
         await self.txn.set(new_key, self.uuid.encode())

     # Basic
     async def get_basic(self) -> PBBasic:
         if self.basic is None:
-            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=self.uuid)
             self.basic = basic if basic is not None else PBBasic()
         return self.basic
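
The new `Resource.get()` classmethod above is what `KnowledgeBox.get()` now delegates to: it loads the resource's `Basic` protobuf, returns `None` when the resource does not exist, and resolves the storage utility itself via `get_storage()`. The shape of that async "alternative constructor" pattern, as a self-contained toy (the `Record` and `FAKE_DB` names are made up for illustration):

from __future__ import annotations

import asyncio

FAKE_DB: dict[tuple[str, str], dict[str, str]] = {("kb-1", "r-1"): {"title": "hello"}}


class Record:
    def __init__(self, kbid: str, rid: str, basic: dict[str, str]):
        self.kbid, self.rid, self.basic = kbid, rid, basic

    @classmethod
    async def get(cls, kbid: str, rid: str) -> Record | None:
        basic = FAKE_DB.get((kbid, rid))  # stand-in for the maindb lookup
        if basic is None:
            return None                   # missing resource -> None, not an exception
        return cls(kbid, rid, basic)


async def main() -> None:
    rec = await Record.get("kb-1", "r-1")
    assert rec is not None and rec.basic["title"] == "hello"
    assert await Record.get("kb-1", "missing") is None


asyncio.run(main())
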
@@ -159,7 +173,7 @@ class Resource:
     async def set_basic(
         self,
         payload: PBBasic,
-        deleted_fields: Optional[list[FieldID]] = None,
+        deleted_fields: list[FieldID] | None = None,
     ):
         await self.get_basic()

@@ -212,49 +226,43 @@ class Resource:
         if deleted_fields is not None and len(deleted_fields) > 0:
             delete_basic_computedmetadata_classifications(self.basic, deleted_fields=deleted_fields)

-        await datamanagers.resources.set_basic(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, basic=self.basic
-        )
+        await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=self.uuid, basic=self.basic)
         self.modified = True

     # Origin
-    async def get_origin(self) -> Optional[PBOrigin]:
+    async def get_origin(self) -> PBOrigin | None:
         if self.origin is None:
-            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kbid, rid=self.uuid)
             self.origin = origin
         return self.origin

     async def set_origin(self, payload: PBOrigin):
-        await datamanagers.resources.set_origin(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, origin=payload
-        )
+        await datamanagers.resources.set_origin(self.txn, kbid=self.kbid, rid=self.uuid, origin=payload)
         self.modified = True
         self.origin = payload

     # Extra
-    async def get_extra(self) -> Optional[PBExtra]:
+    async def get_extra(self) -> PBExtra | None:
         if self.extra is None:
-            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kbid, rid=self.uuid)
             self.extra = extra
         return self.extra

     async def set_extra(self, payload: PBExtra):
-        await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
+        await datamanagers.resources.set_extra(self.txn, kbid=self.kbid, rid=self.uuid, extra=payload)
         self.modified = True
         self.extra = payload

     # Security
-    async def get_security(self) -> Optional[utils_pb2.Security]:
+    async def get_security(self) -> utils_pb2.Security | None:
         if self.security is None:
-            security = await datamanagers.resources.get_security(
-                self.txn, kbid=self.kb.kbid, rid=self.uuid
-            )
+            security = await datamanagers.resources.get_security(self.txn, kbid=self.kbid, rid=self.uuid)
             self.security = security
         return self.security

     async def set_security(self, payload: utils_pb2.Security) -> None:
         await datamanagers.resources.set_security(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, security=payload
+            self.txn, kbid=self.kbid, rid=self.uuid, security=payload
         )
         self.modified = True
         self.security = payload
@@ -262,7 +270,7 @@ class Resource:
     # Relations
     async def get_user_relations(self) -> PBRelations:
         if self.user_relations is None:
-            sf = self.storage.user_relations(self.kb.kbid, self.uuid)
+            sf = self.storage.user_relations(self.kbid, self.uuid)
             relations = await self.storage.download_pb(sf, PBRelations)
             if relations is None:
                 # Key not found = no relations
@@ -272,7 +280,7 @@ class Resource:
         return self.user_relations

     async def set_user_relations(self, payload: PBRelations):
-        sf = self.storage.user_relations(self.kb.kbid, self.uuid)
+        sf = self.storage.user_relations(self.kbid, self.uuid)
         await self.storage.upload_pb(sf, payload)
         self.modified = True
         self.user_relations = payload
@@ -366,22 +374,22 @@ class Resource:
         # REVIEW: are we sure we don't want to actually check this?
         return (type, field) in self.fields

-    async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
+    async def get_all_field_ids(self, *, for_update: bool) -> PBAllFieldIDs | None:
         return await datamanagers.resources.get_all_field_ids(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
+            self.txn, kbid=self.kbid, rid=self.uuid, for_update=for_update
         )

     async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
         return await datamanagers.resources.set_all_field_ids(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, allfields=all_fields
+            self.txn, kbid=self.kbid, rid=self.uuid, allfields=all_fields
         )

     async def update_all_field_ids(
         self,
         *,
-        updated: Optional[list[FieldID]] = None,
-        deleted: Optional[list[FieldID]] = None,
-        errors: Optional[list[writer_pb2.Error]] = None,
+        updated: list[FieldID] | None = None,
+        deleted: list[FieldID] | None = None,
+        errors: list[writer_pb2.Error] | None = None,
     ):
         needs_update = False
         all_fields = await self.get_all_field_ids(for_update=True)
@@ -460,7 +468,7 @@ class Resource:

         # If this message comes from the processor (not a DA worker), we clear all previous errors
         # TODO: When generated_by is populated with DA tasks by processor, remove only related errors
-        from_processor = any((x.WhichOneof("generator") == "processor" for x in message.generated_by))
+        from_processor = any(x.WhichOneof("generator") == "processor" for x in message.generated_by)

         for (field_type, field), errors in errors_by_field.items():
             field_obj = await self.get_field(field, field_type, load=False)
@@ -480,7 +488,7 @@ class Resource:
         # We infer the status for processor messages
         if message.source == BrokerMessage.MessageSource.PROCESSOR:
             if any(
-                (e.source_error.severity == writer_pb2.Error.Severity.ERROR for e in status.errors)
+                e.source_error.severity == writer_pb2.Error.Severity.ERROR for e in status.errors
             ):
                 status.status = writer_pb2.FieldStatus.Status.ERROR
             else:
@@ -510,25 +518,21 @@ class Resource:
             return

         field_statuses = await datamanagers.fields.get_statuses(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, fields=field_ids.fields
+            self.txn, kbid=self.kbid, rid=self.uuid, fields=field_ids.fields
         )

         # If any field is processing -> PENDING
-        if any((f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses)):
+        if any(f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses):
             self.basic.metadata.status = PBMetadata.Status.PENDING
         # If we have any non-DA error -> ERROR
         elif any(
-            (
-                f.status == writer_pb2.FieldStatus.Status.ERROR
-                and any(
-                    (
-                        e.source_error.severity == writer_pb2.Error.Severity.ERROR
-                        and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
-                        for e in f.errors
-                    )
-                )
-                for f in field_statuses
+            f.status == writer_pb2.FieldStatus.Status.ERROR
+            and any(
+                e.source_error.severity == writer_pb2.Error.Severity.ERROR
+                and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
+                for e in f.errors
             )
+            for f in field_statuses
         ):
             self.basic.metadata.status = PBMetadata.Status.ERROR
         # Otherwise (everything processed or we only have DA errors) -> PROCESSED
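
The reformatted `any()` expressions above encode the resource-status rule spelled out in the comments: any field still PENDING makes the resource PENDING; otherwise any field carrying an error of severity ERROR that did not come from a data-augmentation task makes it ERROR; everything else counts as PROCESSED. The same decision logic restated over plain dataclasses (a sketch, not nucliadb code):

from dataclasses import dataclass, field
from enum import Enum


class Status(Enum):
    PENDING = "pending"
    ERROR = "error"
    PROCESSED = "processed"


@dataclass
class FieldError:
    severity_is_error: bool
    from_data_augmentation: bool


@dataclass
class FieldState:
    status: Status
    errors: list[FieldError] = field(default_factory=list)


def derive_resource_status(fields: list[FieldState]) -> Status:
    # Any field still being processed keeps the whole resource pending.
    if any(f.status is Status.PENDING for f in fields):
        return Status.PENDING
    # A field in error only fails the resource if at least one of its errors
    # is a real (non data-augmentation) error.
    if any(
        f.status is Status.ERROR
        and any(e.severity_is_error and not e.from_data_augmentation for e in f.errors)
        for f in fields
    ):
        return Status.ERROR
    return Status.PROCESSED


assert derive_resource_status([FieldState(Status.PROCESSED)]) is Status.PROCESSED
assert derive_resource_status(
    [FieldState(Status.ERROR, [FieldError(True, from_data_augmentation=True)])]
) is Status.PROCESSED  # DA-only errors do not fail the resource
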
@@ -651,7 +655,7 @@ class Resource:
             FieldType.LINK,
             load=False,
         )
-        maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kb.kbid)
+        maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kbid)

         await field_link.set_link_extracted_data(link_extracted_data)

@@ -678,7 +682,7 @@ class Resource:
             return
         logger.info(
             "Updating resource title from link extracted data",
-            extra={"kbid": self.kb.kbid, "field": link_extracted_data.field, "rid": self.uuid},
+            extra={"kbid": self.kbid, "field": link_extracted_data.field, "rid": self.uuid},
         )
         title = link_extracted_data.title
         await self.update_resource_title(title)
@@ -720,7 +724,7 @@ class Resource:
         # uri can change after extraction
         await field_file.set_file_extracted_data(file_extracted_data)
         maybe_update_basic_icon(self.basic, file_extracted_data.icon)
-        maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kb.kbid)
+        maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kbid)
         self.modified = True

     async def _should_update_resource_title_from_file_metadata(self) -> bool:
@@ -742,7 +746,7 @@ class Resource:
         filenames = set()
         for (field_type, _), field_obj in fields.items():
             if field_type == FieldType.FILE:
-                field_value: Optional[FieldFile] = await field_obj.get_value()
+                field_value: FieldFile | None = await field_obj.get_value()
                 if field_value is not None:
                     if field_value.file.filename not in ("", None):
                         filenames.add(field_value.file.filename)
@@ -767,7 +771,7 @@ class Resource:
         fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
         logger.info(
             "Updating resource title from file extracted data",
-            extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
+            extra={"kbid": self.kbid, "field": fid.full(), "new_title": fed.title},
         )
         await self.update_resource_title(fed.title)
         await self.unmark_title_for_reset()
@@ -785,9 +789,7 @@ class Resource:
         )
         await field_obj.set_field_metadata(field_metadata)

-        maybe_update_basic_thumbnail(
-            self.basic, field_metadata.metadata.metadata.thumbnail, self.kb.kbid
-        )
+        maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail, self.kbid)

         update_basic_computedmetadata_classifications(self.basic, field_metadata)
         self.modified = True
@@ -799,7 +801,7 @@ class Resource:
         await self.get_fields(force=True)
         vectorsets = {
             vectorset_id: vs
-            async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kb.kbid)
+            async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kbid)
         }

         for field_vectors in fields_vectors:
@@ -808,13 +810,13 @@ class Resource:
                 assert len(vectorsets) == 1, (
                     "Invalid broker message, can't ingest vectors from unknown vectorset to KB with multiple vectorsets"
                 )
-                vectorset = list(vectorsets.values())[0]
+                vectorset = next(iter(vectorsets.values()))

             else:
                 if field_vectors.vectorset_id not in vectorsets:
                     logger.warning(
                         "Dropping extracted vectors for unknown vectorset",
-                        extra={"kbid": self.kb.kbid, "vectorset": field_vectors.vectorset_id},
+                        extra={"kbid": self.kbid, "vectorset": field_vectors.vectorset_id},
                     )
                     continue

@@ -925,7 +927,7 @@ def maybe_update_basic_summary(basic: PBBasic, summary_text: str) -> bool:
     return True


-def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
+def maybe_update_basic_icon(basic: PBBasic, mimetype: str | None) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
@@ -944,7 +946,7 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     return True


-def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile], kbid: str) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: CloudFile | None, kbid: str) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
@@ -981,7 +983,7 @@ def update_basic_languages(basic: Basic, languages: list[str]) -> bool:
     return updated


-def get_text_field_mimetype(bm: BrokerMessage) -> Optional[str]:
+def get_text_field_mimetype(bm: BrokerMessage) -> str | None:
     if len(bm.texts) == 0:
         return None
     text_format = next(iter(bm.texts.values())).format

--- a/nucliadb/ingest/orm/utils.py
+++ b/nucliadb/ingest/orm/utils.py
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.

 import urllib.parse
-from typing import Sequence
+from collections.abc import Sequence

 from nucliadb.models.internal.processing import PushPayload, PushTextFormat, Text
 from nucliadb_protos.resources_pb2 import (