nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -22,8 +22,9 @@ from __future__ import annotations
22
22
  import asyncio
23
23
  import logging
24
24
  from collections import defaultdict
25
+ from collections.abc import Sequence
25
26
  from concurrent.futures import ThreadPoolExecutor
26
- from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
27
+ from typing import Any
27
28
 
28
29
  from nucliadb.common import datamanagers
29
30
  from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
@@ -68,13 +69,11 @@ from nucliadb_protos.resources_pb2 import Origin as PBOrigin
68
69
  from nucliadb_protos.resources_pb2 import Relations as PBRelations
69
70
  from nucliadb_protos.writer_pb2 import BrokerMessage
70
71
  from nucliadb_utils.storages.storage import Storage
71
-
72
- if TYPE_CHECKING: # pragma: no cover
73
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
72
+ from nucliadb_utils.utilities import get_storage
74
73
 
75
74
  logger = logging.getLogger(__name__)
76
75
 
77
- KB_FIELDS: dict[int, Type] = {
76
+ KB_FIELDS: dict[int, type] = {
78
77
  FieldType.TEXT: Text,
79
78
  FieldType.FILE: File,
80
79
  FieldType.LINK: Link,
@@ -104,40 +103,55 @@ class Resource:
104
103
  self,
105
104
  txn: Transaction,
106
105
  storage: Storage,
107
- kb: KnowledgeBox,
106
+ kbid: str,
108
107
  uuid: str,
109
- basic: Optional[PBBasic] = None,
108
+ basic: PBBasic | None = None,
110
109
  disable_vectors: bool = True,
111
110
  ):
112
111
  self.fields: dict[tuple[FieldType.ValueType, str], Field] = {}
113
112
  self.conversations: dict[int, PBConversation] = {}
114
- self.relations: Optional[PBRelations] = None
115
- self.all_fields_keys: Optional[list[tuple[FieldType.ValueType, str]]] = None
116
- self.origin: Optional[PBOrigin] = None
117
- self.extra: Optional[PBExtra] = None
118
- self.security: Optional[utils_pb2.Security] = None
113
+ self.relations: PBRelations | None = None
114
+ self.all_fields_keys: list[tuple[FieldType.ValueType, str]] | None = None
115
+ self.origin: PBOrigin | None = None
116
+ self.extra: PBExtra | None = None
117
+ self.security: utils_pb2.Security | None = None
119
118
  self.modified: bool = False
120
119
  self._modified_extracted_text: list[FieldID] = []
121
120
 
122
121
  self.txn = txn
123
122
  self.storage = storage
124
- self.kb = kb
123
+ self.kbid = kbid
125
124
  self.uuid = uuid
126
125
  self.basic = basic
127
126
  self.disable_vectors = disable_vectors
128
- self._previous_status: Optional[Metadata.Status.ValueType] = None
129
- self.user_relations: Optional[PBRelations] = None
127
+ self._previous_status: Metadata.Status.ValueType | None = None
128
+ self.user_relations: PBRelations | None = None
130
129
  self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
131
130
 
131
+ @classmethod
132
+ async def get(cls, txn: Transaction, kbid: str, rid: str) -> Resource | None:
133
+ basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
134
+ if basic is None:
135
+ return None
136
+ storage = await get_storage()
137
+ return cls(
138
+ txn=txn,
139
+ storage=storage,
140
+ kbid=kbid,
141
+ uuid=rid,
142
+ basic=basic,
143
+ disable_vectors=False,
144
+ )
145
+
132
146
  async def set_slug(self):
133
147
  basic = await self.get_basic()
134
- new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
148
+ new_key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug)
135
149
  await self.txn.set(new_key, self.uuid.encode())
136
150
 
137
151
  # Basic
138
- async def get_basic(self) -> Optional[PBBasic]:
152
+ async def get_basic(self) -> PBBasic:
139
153
  if self.basic is None:
140
- basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
154
+ basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=self.uuid)
141
155
  self.basic = basic if basic is not None else PBBasic()
142
156
  return self.basic
143
157
 
@@ -159,7 +173,7 @@ class Resource:
159
173
  async def set_basic(
160
174
  self,
161
175
  payload: PBBasic,
162
- deleted_fields: Optional[list[FieldID]] = None,
176
+ deleted_fields: list[FieldID] | None = None,
163
177
  ):
164
178
  await self.get_basic()
165
179
 
@@ -212,49 +226,43 @@ class Resource:
212
226
  if deleted_fields is not None and len(deleted_fields) > 0:
213
227
  delete_basic_computedmetadata_classifications(self.basic, deleted_fields=deleted_fields)
214
228
 
215
- await datamanagers.resources.set_basic(
216
- self.txn, kbid=self.kb.kbid, rid=self.uuid, basic=self.basic
217
- )
229
+ await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=self.uuid, basic=self.basic)
218
230
  self.modified = True
219
231
 
220
232
  # Origin
221
- async def get_origin(self) -> Optional[PBOrigin]:
233
+ async def get_origin(self) -> PBOrigin | None:
222
234
  if self.origin is None:
223
- origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
235
+ origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kbid, rid=self.uuid)
224
236
  self.origin = origin
225
237
  return self.origin
226
238
 
227
239
  async def set_origin(self, payload: PBOrigin):
228
- await datamanagers.resources.set_origin(
229
- self.txn, kbid=self.kb.kbid, rid=self.uuid, origin=payload
230
- )
240
+ await datamanagers.resources.set_origin(self.txn, kbid=self.kbid, rid=self.uuid, origin=payload)
231
241
  self.modified = True
232
242
  self.origin = payload
233
243
 
234
244
  # Extra
235
- async def get_extra(self) -> Optional[PBExtra]:
245
+ async def get_extra(self) -> PBExtra | None:
236
246
  if self.extra is None:
237
- extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
247
+ extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kbid, rid=self.uuid)
238
248
  self.extra = extra
239
249
  return self.extra
240
250
 
241
251
  async def set_extra(self, payload: PBExtra):
242
- await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
252
+ await datamanagers.resources.set_extra(self.txn, kbid=self.kbid, rid=self.uuid, extra=payload)
243
253
  self.modified = True
244
254
  self.extra = payload
245
255
 
246
256
  # Security
247
- async def get_security(self) -> Optional[utils_pb2.Security]:
257
+ async def get_security(self) -> utils_pb2.Security | None:
248
258
  if self.security is None:
249
- security = await datamanagers.resources.get_security(
250
- self.txn, kbid=self.kb.kbid, rid=self.uuid
251
- )
259
+ security = await datamanagers.resources.get_security(self.txn, kbid=self.kbid, rid=self.uuid)
252
260
  self.security = security
253
261
  return self.security
254
262
 
255
263
  async def set_security(self, payload: utils_pb2.Security) -> None:
256
264
  await datamanagers.resources.set_security(
257
- self.txn, kbid=self.kb.kbid, rid=self.uuid, security=payload
265
+ self.txn, kbid=self.kbid, rid=self.uuid, security=payload
258
266
  )
259
267
  self.modified = True
260
268
  self.security = payload
@@ -262,7 +270,7 @@ class Resource:
262
270
  # Relations
263
271
  async def get_user_relations(self) -> PBRelations:
264
272
  if self.user_relations is None:
265
- sf = self.storage.user_relations(self.kb.kbid, self.uuid)
273
+ sf = self.storage.user_relations(self.kbid, self.uuid)
266
274
  relations = await self.storage.download_pb(sf, PBRelations)
267
275
  if relations is None:
268
276
  # Key not found = no relations
@@ -272,7 +280,7 @@ class Resource:
272
280
  return self.user_relations
273
281
 
274
282
  async def set_user_relations(self, payload: PBRelations):
275
- sf = self.storage.user_relations(self.kb.kbid, self.uuid)
283
+ sf = self.storage.user_relations(self.kbid, self.uuid)
276
284
  await self.storage.upload_pb(sf, payload)
277
285
  self.modified = True
278
286
  self.user_relations = payload
@@ -354,25 +362,34 @@ class Resource:
354
362
 
355
363
  await field_obj.delete()
356
364
 
365
+ async def field_exists(self, type: FieldType.ValueType, field: str) -> bool:
366
+ """Return whether this resource has this field or not."""
367
+ all_fields_ids = await self.get_fields_ids()
368
+ for field_type, field_id in all_fields_ids:
369
+ if field_type == type and field_id == field:
370
+ return True
371
+ return False
372
+
357
373
  def has_field(self, type: FieldType.ValueType, field: str) -> bool:
374
+ # REVIEW: are we sure we don't want to actually check this?
358
375
  return (type, field) in self.fields
359
376
 
360
- async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
377
+ async def get_all_field_ids(self, *, for_update: bool) -> PBAllFieldIDs | None:
361
378
  return await datamanagers.resources.get_all_field_ids(
362
- self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
379
+ self.txn, kbid=self.kbid, rid=self.uuid, for_update=for_update
363
380
  )
364
381
 
365
382
  async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
366
383
  return await datamanagers.resources.set_all_field_ids(
367
- self.txn, kbid=self.kb.kbid, rid=self.uuid, allfields=all_fields
384
+ self.txn, kbid=self.kbid, rid=self.uuid, allfields=all_fields
368
385
  )
369
386
 
370
387
  async def update_all_field_ids(
371
388
  self,
372
389
  *,
373
- updated: Optional[list[FieldID]] = None,
374
- deleted: Optional[list[FieldID]] = None,
375
- errors: Optional[list[writer_pb2.Error]] = None,
390
+ updated: list[FieldID] | None = None,
391
+ deleted: list[FieldID] | None = None,
392
+ errors: list[writer_pb2.Error] | None = None,
376
393
  ):
377
394
  needs_update = False
378
395
  all_fields = await self.get_all_field_ids(for_update=True)
@@ -451,7 +468,7 @@ class Resource:
451
468
 
452
469
  # If this message comes from the processor (not a DA worker), we clear all previous errors
453
470
  # TODO: When generated_by is populated with DA tasks by processor, remove only related errors
454
- from_processor = any((x.WhichOneof("generator") == "processor" for x in message.generated_by))
471
+ from_processor = any(x.WhichOneof("generator") == "processor" for x in message.generated_by)
455
472
 
456
473
  for (field_type, field), errors in errors_by_field.items():
457
474
  field_obj = await self.get_field(field, field_type, load=False)
@@ -471,7 +488,7 @@ class Resource:
471
488
  # We infer the status for processor messages
472
489
  if message.source == BrokerMessage.MessageSource.PROCESSOR:
473
490
  if any(
474
- (e.source_error.severity == writer_pb2.Error.Severity.ERROR for e in status.errors)
491
+ e.source_error.severity == writer_pb2.Error.Severity.ERROR for e in status.errors
475
492
  ):
476
493
  status.status = writer_pb2.FieldStatus.Status.ERROR
477
494
  else:
@@ -501,25 +518,21 @@ class Resource:
501
518
  return
502
519
 
503
520
  field_statuses = await datamanagers.fields.get_statuses(
504
- self.txn, kbid=self.kb.kbid, rid=self.uuid, fields=field_ids.fields
521
+ self.txn, kbid=self.kbid, rid=self.uuid, fields=field_ids.fields
505
522
  )
506
523
 
507
524
  # If any field is processing -> PENDING
508
- if any((f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses)):
525
+ if any(f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses):
509
526
  self.basic.metadata.status = PBMetadata.Status.PENDING
510
527
  # If we have any non-DA error -> ERROR
511
528
  elif any(
512
- (
513
- f.status == writer_pb2.FieldStatus.Status.ERROR
514
- and any(
515
- (
516
- e.source_error.severity == writer_pb2.Error.Severity.ERROR
517
- and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
518
- for e in f.errors
519
- )
520
- )
521
- for f in field_statuses
529
+ f.status == writer_pb2.FieldStatus.Status.ERROR
530
+ and any(
531
+ e.source_error.severity == writer_pb2.Error.Severity.ERROR
532
+ and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
533
+ for e in f.errors
522
534
  )
535
+ for f in field_statuses
523
536
  ):
524
537
  self.basic.metadata.status = PBMetadata.Status.ERROR
525
538
  # Otherwise (everything processed or we only have DA errors) -> PROCESSED
@@ -642,7 +655,7 @@ class Resource:
642
655
  FieldType.LINK,
643
656
  load=False,
644
657
  )
645
- maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kb.kbid)
658
+ maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kbid)
646
659
 
647
660
  await field_link.set_link_extracted_data(link_extracted_data)
648
661
 
@@ -669,7 +682,7 @@ class Resource:
669
682
  return
670
683
  logger.info(
671
684
  "Updating resource title from link extracted data",
672
- extra={"kbid": self.kb.kbid, "field": link_extracted_data.field, "rid": self.uuid},
685
+ extra={"kbid": self.kbid, "field": link_extracted_data.field, "rid": self.uuid},
673
686
  )
674
687
  title = link_extracted_data.title
675
688
  await self.update_resource_title(title)
@@ -711,7 +724,7 @@ class Resource:
711
724
  # uri can change after extraction
712
725
  await field_file.set_file_extracted_data(file_extracted_data)
713
726
  maybe_update_basic_icon(self.basic, file_extracted_data.icon)
714
- maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kb.kbid)
727
+ maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kbid)
715
728
  self.modified = True
716
729
 
717
730
  async def _should_update_resource_title_from_file_metadata(self) -> bool:
@@ -733,7 +746,7 @@ class Resource:
733
746
  filenames = set()
734
747
  for (field_type, _), field_obj in fields.items():
735
748
  if field_type == FieldType.FILE:
736
- field_value: Optional[FieldFile] = await field_obj.get_value()
749
+ field_value: FieldFile | None = await field_obj.get_value()
737
750
  if field_value is not None:
738
751
  if field_value.file.filename not in ("", None):
739
752
  filenames.add(field_value.file.filename)
@@ -758,7 +771,7 @@ class Resource:
758
771
  fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
759
772
  logger.info(
760
773
  "Updating resource title from file extracted data",
761
- extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
774
+ extra={"kbid": self.kbid, "field": fid.full(), "new_title": fed.title},
762
775
  )
763
776
  await self.update_resource_title(fed.title)
764
777
  await self.unmark_title_for_reset()
@@ -776,9 +789,7 @@ class Resource:
776
789
  )
777
790
  await field_obj.set_field_metadata(field_metadata)
778
791
 
779
- maybe_update_basic_thumbnail(
780
- self.basic, field_metadata.metadata.metadata.thumbnail, self.kb.kbid
781
- )
792
+ maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail, self.kbid)
782
793
 
783
794
  update_basic_computedmetadata_classifications(self.basic, field_metadata)
784
795
  self.modified = True
@@ -790,7 +801,7 @@ class Resource:
790
801
  await self.get_fields(force=True)
791
802
  vectorsets = {
792
803
  vectorset_id: vs
793
- async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kb.kbid)
804
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kbid)
794
805
  }
795
806
 
796
807
  for field_vectors in fields_vectors:
@@ -799,13 +810,13 @@ class Resource:
799
810
  assert len(vectorsets) == 1, (
800
811
  "Invalid broker message, can't ingest vectors from unknown vectorset to KB with multiple vectorsets"
801
812
  )
802
- vectorset = list(vectorsets.values())[0]
813
+ vectorset = next(iter(vectorsets.values()))
803
814
 
804
815
  else:
805
816
  if field_vectors.vectorset_id not in vectorsets:
806
817
  logger.warning(
807
818
  "Dropping extracted vectors for unknown vectorset",
808
- extra={"kbid": self.kb.kbid, "vectorset": field_vectors.vectorset_id},
819
+ extra={"kbid": self.kbid, "vectorset": field_vectors.vectorset_id},
809
820
  )
810
821
  continue
811
822
 
@@ -916,7 +927,7 @@ def maybe_update_basic_summary(basic: PBBasic, summary_text: str) -> bool:
916
927
  return True
917
928
 
918
929
 
919
- def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
930
+ def maybe_update_basic_icon(basic: PBBasic, mimetype: str | None) -> bool:
920
931
  if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
921
932
  # Icon already set or detected
922
933
  return False
@@ -935,7 +946,7 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
935
946
  return True
936
947
 
937
948
 
938
- def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile], kbid: str) -> bool:
949
+ def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: CloudFile | None, kbid: str) -> bool:
939
950
  if basic.thumbnail or thumbnail is None:
940
951
  return False
941
952
  basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
@@ -972,7 +983,7 @@ def update_basic_languages(basic: Basic, languages: list[str]) -> bool:
972
983
  return updated
973
984
 
974
985
 
975
- def get_text_field_mimetype(bm: BrokerMessage) -> Optional[str]:
986
+ def get_text_field_mimetype(bm: BrokerMessage) -> str | None:
976
987
  if len(bm.texts) == 0:
977
988
  return None
978
989
  text_format = next(iter(bm.texts.values())).format
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
20
  import urllib.parse
21
- from typing import Sequence
21
+ from collections.abc import Sequence
22
22
 
23
23
  from nucliadb.models.internal.processing import PushPayload, PushTextFormat, Text
24
24
  from nucliadb_protos.resources_pb2 import (
@@ -25,12 +25,17 @@ from nucliadb.ingest.settings import Settings
25
25
 
26
26
 
27
27
  def assign_partitions(settings: Settings):
28
+ """
29
+ This function dynamically assigns the partitions to the current ingest sts
30
+ replica based on its hostname, typically (ingest-0, ingest-1, etc).
31
+ """
28
32
  # partitions start from 1, instead of 0
29
33
  all_partitions = [str(part + 1) for part in range(settings.nuclia_partitions)]
30
34
 
31
35
  # get replica number and total replicas from environment
32
36
  logger.info(f"PARTITIONS: Total Replicas = {settings.total_replicas}")
33
37
  if settings.replica_number == -1:
38
+ # Get replica number from hostname
34
39
  hostname = os.environ.get("HOSTNAME")
35
40
  if hostname is not None:
36
41
  sts_values = hostname.split("-")
@@ -39,10 +44,16 @@ def assign_partitions(settings: Settings):
39
44
  settings.replica_number = int(sts_values[-1])
40
45
  except Exception:
41
46
  logger.error(f"Could not extract replica number from hostname: {hostname}")
42
- pass
47
+ else:
48
+ logger.warning(f"Could not determine replica number from hostname: {hostname}")
49
+ else:
50
+ logger.warning(f"Could not determine replica number from hostname.")
43
51
 
44
52
  if settings.replica_number == -1:
45
53
  settings.replica_number = 0
54
+ else:
55
+ # We assume that replica numbers are set manually via env variables
56
+ pass
46
57
  logger.info(f"PARTITIONS: Replica Number = {settings.replica_number}")
47
58
 
48
59
  # calculate assigned partitions based on total replicas and own replica number
@@ -25,7 +25,7 @@ import uuid
25
25
  from collections import defaultdict
26
26
  from contextlib import AsyncExitStack
27
27
  from enum import Enum
28
- from typing import Any, Optional
28
+ from typing import Any
29
29
 
30
30
  import aiohttp
31
31
  import backoff
@@ -132,19 +132,19 @@ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> Process
132
132
  class ProcessingEngine:
133
133
  def __init__(
134
134
  self,
135
- nuclia_service_account: Optional[str] = None,
136
- nuclia_zone: Optional[str] = None,
137
- nuclia_public_url: Optional[str] = None,
138
- nuclia_processing_cluster_url: Optional[str] = None,
139
- onprem: Optional[bool] = False,
140
- nuclia_jwt_key: Optional[str] = None,
135
+ nuclia_service_account: str | None = None,
136
+ nuclia_zone: str | None = None,
137
+ nuclia_public_url: str | None = None,
138
+ nuclia_processing_cluster_url: str | None = None,
139
+ onprem: bool | None = False,
140
+ nuclia_jwt_key: str | None = None,
141
141
  days_to_keep: int = 3,
142
142
  driver: FileBackendConfig = FileBackendConfig.GCS,
143
143
  ):
144
144
  self.nuclia_service_account = nuclia_service_account
145
145
  self.nuclia_zone = nuclia_zone
146
146
  if nuclia_public_url is not None:
147
- self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
147
+ self.nuclia_public_url: str | None = nuclia_public_url.format(zone=nuclia_zone)
148
148
  else:
149
149
  self.nuclia_public_url = None
150
150
 
@@ -196,7 +196,7 @@ class ProcessingEngine:
196
196
  return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
197
197
 
198
198
  def generate_file_token_from_fieldfile(
199
- self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
199
+ self, file: FieldFilePB, classif_labels: list[ClassificationLabel] | None = None
200
200
  ) -> str:
201
201
  if self.nuclia_jwt_key is None:
202
202
  raise AttributeError("Nuclia JWT key not set")
@@ -235,7 +235,7 @@ class ProcessingEngine:
235
235
  )
236
236
  @processing_observer.wrap({"type": "file_field_upload"})
237
237
  async def convert_filefield_to_str(
238
- self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
238
+ self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
239
239
  ) -> str:
240
240
  # Upload file without storing on Nuclia DB
241
241
  headers = {}
@@ -273,7 +273,7 @@ class ProcessingEngine:
273
273
  ).decode()
274
274
 
275
275
  def convert_external_filefield_to_str(
276
- self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
276
+ self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
277
277
  ) -> str:
278
278
  if self.nuclia_jwt_key is None:
279
279
  raise AttributeError("Nuclia JWT key not set")
@@ -313,7 +313,7 @@ class ProcessingEngine:
313
313
  self,
314
314
  file: FieldFilePB,
315
315
  storage: Storage,
316
- classif_labels: Optional[list[ClassificationLabel]] = None,
316
+ classif_labels: list[ClassificationLabel] | None = None,
317
317
  ) -> str:
318
318
  """It's already an internal file that needs to be uploaded"""
319
319
  if self.onprem is False:
@@ -438,7 +438,7 @@ class ProcessingEngine:
438
438
  queue=QueueType(queue_type) if queue_type is not None else None,
439
439
  )
440
440
 
441
- async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
441
+ async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
442
442
  """
443
443
  Delete a resource from processing. This prevents inflight resources from being processed
444
444
  and wasting resources.
@@ -479,7 +479,7 @@ class DummyProcessingEngine(ProcessingEngine):
479
479
  pass
480
480
 
481
481
  async def convert_filefield_to_str(
482
- self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
482
+ self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
483
483
  ) -> str:
484
484
  self.calls.append([file])
485
485
  index = len(self.values["convert_filefield_to_str"])
@@ -487,7 +487,7 @@ class DummyProcessingEngine(ProcessingEngine):
487
487
  return f"convert_filefield_to_str,{index}"
488
488
 
489
489
  def convert_external_filefield_to_str(
490
- self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
490
+ self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
491
491
  ) -> str:
492
492
  self.calls.append([file_field])
493
493
  index = len(self.values["convert_external_filefield_to_str"])
@@ -498,7 +498,7 @@ class DummyProcessingEngine(ProcessingEngine):
498
498
  self,
499
499
  file: FieldFilePB,
500
500
  storage: Storage,
501
- classif_labels: Optional[list[ClassificationLabel]] = None,
501
+ classif_labels: list[ClassificationLabel] | None = None,
502
502
  ) -> str:
503
503
  self.calls.append([file, storage])
504
504
  index = len(self.values["convert_internal_filefield_to_str"])
@@ -516,5 +516,5 @@ class DummyProcessingEngine(ProcessingEngine):
516
516
  self.values["send_to_process"].append([item, partition])
517
517
  return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
518
518
 
519
- async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
519
+ async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
520
520
  self.calls.append([kbid, resource_id])