nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import asyncio
21
21
  import logging
22
- from typing import Optional
23
22
 
24
23
  import aiohttp.client_exceptions
25
24
  import nats.errors
@@ -28,6 +27,7 @@ from nidx_protos import noderesources_pb2, nodewriter_pb2
28
27
  from nidx_protos.noderesources_pb2 import Resource as PBBrainResource
29
28
 
30
29
  from nucliadb.common import datamanagers, locking
30
+ from nucliadb.common.catalog import catalog_delete, catalog_update
31
31
  from nucliadb.common.cluster.settings import settings as cluster_settings
32
32
  from nucliadb.common.cluster.utils import get_shard_manager
33
33
  from nucliadb.common.external_index_providers.base import ExternalIndexManager
@@ -61,8 +61,6 @@ from nucliadb_utils.cache.pubsub import PubSubDriver
61
61
  from nucliadb_utils.storages.storage import Storage
62
62
  from nucliadb_utils.utilities import get_storage, has_feature
63
63
 
64
- from .pgcatalog import pgcatalog_delete, pgcatalog_update
65
-
66
64
  logger = logging.getLogger("ingest-processor")
67
65
 
68
66
  MESSAGE_TO_NOTIFICATION_SOURCE = {
@@ -142,16 +140,13 @@ class Processor:
142
140
  and can not use the txn id
143
141
  """
144
142
 
145
- messages: dict[str, list[writer_pb2.BrokerMessage]]
146
-
147
143
  def __init__(
148
144
  self,
149
145
  driver: Driver,
150
146
  storage: Storage,
151
- pubsub: Optional[PubSubDriver] = None,
152
- partition: Optional[str] = None,
147
+ pubsub: PubSubDriver | None = None,
148
+ partition: str | None = None,
153
149
  ):
154
- self.messages = {}
155
150
  self.driver = driver
156
151
  self.storage = storage
157
152
  self.partition = partition
@@ -162,7 +157,7 @@ class Processor:
162
157
  self,
163
158
  message: writer_pb2.BrokerMessage,
164
159
  seqid: int,
165
- partition: Optional[str] = None,
160
+ partition: str | None = None,
166
161
  transaction_check: bool = True,
167
162
  ) -> None:
168
163
  partition = partition if self.partition is None else self.partition
@@ -180,18 +175,12 @@ class Processor:
180
175
  if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
181
176
  await self.delete_resource(message, seqid, partition, transaction_check)
182
177
  elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
183
- await self.txn([message], seqid, partition, transaction_check)
184
- elif message.type == writer_pb2.BrokerMessage.MessageType.MULTI:
185
- # XXX Not supported right now
186
- # MULTI, COMMIT and ROLLBACK are all not supported in transactional mode right now
187
- # This concept is probably not tenable with current architecture because
188
- # of how nats works and how we would need to manage rollbacks.
189
- # XXX Should this be removed?
190
- await self.multi(message, seqid)
191
- elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
192
- await self.commit(message, seqid, partition)
193
- elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
194
- await self.rollback(message, seqid, partition)
178
+ await self.txn(message, seqid, partition, transaction_check)
179
+ else: # pragma: no cover
180
+ logger.error(
181
+ f"Unsupported message type: {message.type}",
182
+ extra={"seqid": seqid, "partition": partition},
183
+ )
195
184
 
196
185
  async def get_resource_uuid(self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage) -> str:
197
186
  if message.uuid is None:
@@ -227,7 +216,8 @@ class Processor:
227
216
  shard = await kb.get_resource_shard(shard_id)
228
217
  if shard is None:
229
218
  raise AttributeError("Shard not available")
230
- await pgcatalog_delete(txn, message.kbid, uuid)
219
+
220
+ await catalog_delete(txn, message.kbid, uuid)
231
221
  external_index_manager = await get_external_index_manager(kbid=message.kbid)
232
222
  if external_index_manager is not None:
233
223
  await self.external_index_delete_resource(external_index_manager, uuid)
@@ -242,7 +232,6 @@ class Processor:
242
232
  await self.notify_abort(
243
233
  partition=partition,
244
234
  seqid=seqid,
245
- multi=message.multiid,
246
235
  kbid=message.kbid,
247
236
  rid=message.uuid,
248
237
  source=message.source,
@@ -256,7 +245,6 @@ class Processor:
256
245
  await self.notify_commit(
257
246
  partition=partition,
258
247
  seqid=seqid,
259
- multi=message.multiid,
260
248
  message=message,
261
249
  write_type=writer_pb2.Notification.WriteType.DELETED,
262
250
  )
@@ -277,15 +265,12 @@ class Processor:
277
265
  @processor_observer.wrap({"type": "txn"})
278
266
  async def txn(
279
267
  self,
280
- messages: list[writer_pb2.BrokerMessage],
268
+ message: writer_pb2.BrokerMessage,
281
269
  seqid: int,
282
270
  partition: str,
283
271
  transaction_check: bool = True,
284
272
  ) -> None:
285
- if len(messages) == 0:
286
- return None
287
-
288
- kbid = messages[0].kbid
273
+ kbid = message.kbid
289
274
  if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
290
275
  logger.info(f"KB {kbid} is deleted: skiping txn")
291
276
  if transaction_check:
@@ -296,58 +281,55 @@ class Processor:
296
281
 
297
282
  async with self.driver.rw_transaction() as txn:
298
283
  try:
299
- multi = messages[0].multiid
300
284
  kb = KnowledgeBox(txn, self.storage, kbid)
301
- uuid = await self.get_resource_uuid(kb, messages[0])
302
- resource: Optional[Resource] = None
285
+ uuid = await self.get_resource_uuid(kb, message)
286
+
287
+ resource: Resource | None = None
303
288
  handled_exception = None
304
289
  created = False
305
290
 
306
- for message in messages:
307
- if resource is not None:
308
- assert resource.uuid == message.uuid
309
-
310
- if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
311
- resource = await kb.get(uuid)
312
- if resource is None:
313
- # It's a new resource
314
- resource = await kb.add_resource(uuid, message.slug, message.basic)
315
- created = True
316
- else:
317
- # It's an update from writer for an existing resource
318
- ...
319
-
320
- elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
321
- resource = await kb.get(uuid)
322
- if resource is None:
323
- logger.info(
324
- f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
325
- )
326
- continue
327
- else:
328
- # It's an update from processor for an existing resource
329
- ...
330
-
331
- generated_fields = await get_generated_fields(message, resource)
332
- if generated_fields.is_not_empty():
333
- await send_generated_fields_to_process(
334
- kbid, resource, generated_fields, message
335
- )
336
- # TODO: remove this when processor sends the field set
337
- for generated_text in generated_fields.texts:
338
- message.texts[
339
- generated_text
340
- ].generated_by.data_augmentation.SetInParent()
341
-
291
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
292
+ resource = await kb.get(uuid)
293
+ if resource is None:
294
+ # It's a new resource
295
+ resource = await kb.add_resource(uuid, message.slug, message.basic)
296
+ created = True
342
297
  else:
343
- raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
298
+ # It's an update from writer for an existing resource
299
+ ...
300
+
301
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
302
+ resource = await kb.get(uuid)
303
+ if resource is None:
304
+ logger.info(
305
+ f"Processor message for resource received but the resource does not exist, ignoring.",
306
+ extra={
307
+ "kbid": kbid,
308
+ "rid": uuid,
309
+ "seqid": seqid,
310
+ },
311
+ )
312
+ return None
313
+ else:
314
+ # It's an update from processor for an existing resource
315
+ ...
316
+
317
+ generated_fields = await get_generated_fields(message, resource)
318
+ if generated_fields.is_not_empty():
319
+ await send_generated_fields_to_process(kbid, resource, generated_fields, message)
320
+ # TODO: remove this when processor sends the field set
321
+ for generated_text in generated_fields.texts:
322
+ message.texts[generated_text].generated_by.data_augmentation.SetInParent()
323
+
324
+ else: # pragma: no cover
325
+ raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
344
326
 
345
- # apply changes from the broker message to the resource
346
- await self.apply_resource(message, resource, update=(not created))
327
+ # apply changes from the broker message to the resource
328
+ await self.apply_resource(message, resource, update=(not created))
347
329
 
348
330
  # index message
349
331
  if resource and resource.modified:
350
- index_message = await self.generate_index_message(resource, messages, created)
332
+ index_message = await self.generate_index_message(resource, message, created)
351
333
  try:
352
334
  warnings = await self.index_resource(
353
335
  index_message=index_message,
@@ -357,7 +339,7 @@ class Processor:
357
339
  seqid=seqid,
358
340
  partition=partition,
359
341
  kb=kb,
360
- source=messages_source(messages),
342
+ source=to_index_message_source(message),
361
343
  )
362
344
  # Save indexing warnings
363
345
  for field_id, warning in warnings:
@@ -374,8 +356,7 @@ class Processor:
374
356
  index_message.labels.remove(current_status[0])
375
357
  index_message.labels.append("/n/s/ERROR")
376
358
 
377
- await pgcatalog_update(txn, kbid, resource, index_message)
378
-
359
+ await catalog_update(txn, kbid, resource, index_message)
379
360
  if transaction_check:
380
361
  await sequence_manager.set_last_seqid(txn, partition, seqid)
381
362
  await txn.commit()
@@ -386,7 +367,6 @@ class Processor:
386
367
  await self.notify_commit(
387
368
  partition=partition,
388
369
  seqid=seqid,
389
- multi=multi,
390
370
  message=message,
391
371
  write_type=(
392
372
  writer_pb2.Notification.WriteType.CREATED
@@ -399,7 +379,6 @@ class Processor:
399
379
  await self.notify_abort(
400
380
  partition=partition,
401
381
  seqid=seqid,
402
- multi=multi,
403
382
  kbid=kbid,
404
383
  rid=uuid,
405
384
  source=message.source,
@@ -419,7 +398,6 @@ class Processor:
419
398
  await self.notify_abort(
420
399
  partition=partition,
421
400
  seqid=seqid,
422
- multi=multi,
423
401
  kbid=kbid,
424
402
  rid=uuid,
425
403
  source=message.source,
@@ -429,11 +407,10 @@ class Processor:
429
407
  # As we are in the middle of a transaction, we cannot let the exception raise directly
430
408
  # as we need to do some cleanup. The exception will be reraised at the end of the function
431
409
  # and then handled by the top caller, so errors can be handled in the same place.
432
- await self.deadletter(messages, partition, seqid)
410
+ await self.deadletter(message, partition, seqid)
433
411
  await self.notify_abort(
434
412
  partition=partition,
435
413
  seqid=seqid,
436
- multi=multi,
437
414
  kbid=kbid,
438
415
  rid=uuid,
439
416
  source=message.source,
@@ -468,22 +445,27 @@ class Processor:
468
445
  # a resource was move to another shard while it was being indexed
469
446
  shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=uuid)
470
447
 
471
- shard = None
472
- if shard_id is not None:
473
- # Resource already has a shard assigned
474
- shard = await kb.get_resource_shard(shard_id)
475
- if shard is None:
476
- raise AttributeError("Shard not available")
477
- else:
478
- # It's a new resource, get KB's current active shard to place new resource on
479
- shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
480
- if shard is None:
481
- # No current shard available, create a new one
482
- shard = await self.index_node_shard_manager.create_shard_by_kbid(txn, kbid)
483
- await datamanagers.resources.set_resource_shard_id(
484
- txn, kbid=kbid, rid=uuid, shard=shard.shard
485
- )
486
- return shard
448
+ shard = None
449
+ if shard_id is not None:
450
+ # Resource already has a shard assigned
451
+ shard = await kb.get_resource_shard(shard_id)
452
+ if shard is None:
453
+ raise AttributeError("Shard not available")
454
+ else:
455
+ # It's a new resource, get KB's current active shard to place new resource on
456
+ shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
457
+ if shard is None:
458
+ # No current shard available, create a new one
459
+ async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
460
+ kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
461
+ prewarm = kb_config is not None and kb_config.prewarm_enabled
462
+ shard = await self.index_node_shard_manager.create_shard_by_kbid(
463
+ txn, kbid, prewarm_enabled=prewarm
464
+ )
465
+ await datamanagers.resources.set_resource_shard_id(
466
+ txn, kbid=kbid, rid=uuid, shard=shard.shard
467
+ )
468
+ return shard
487
469
 
488
470
  @processor_observer.wrap({"type": "index_resource"})
489
471
  async def index_resource(
@@ -519,17 +501,16 @@ class Processor:
519
501
  async def generate_index_message(
520
502
  self,
521
503
  resource: Resource,
522
- messages: list[writer_pb2.BrokerMessage],
504
+ message: writer_pb2.BrokerMessage,
523
505
  resource_created: bool,
524
506
  ) -> PBBrainResource:
525
507
  builder = IndexMessageBuilder(resource)
526
- message_source = messages_source(messages)
527
- if message_source == nodewriter_pb2.IndexMessageSource.WRITER:
528
- return await builder.for_writer_bm(messages, resource_created)
529
- elif message_source == nodewriter_pb2.IndexMessageSource.PROCESSOR:
530
- return await builder.for_processor_bm(messages)
508
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
509
+ return await builder.for_writer_bm(message, resource_created)
510
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
511
+ return await builder.for_processor_bm(message)
531
512
  else: # pragma: no cover
532
- raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
513
+ raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
533
514
 
534
515
  async def external_index_delete_resource(
535
516
  self, external_index_manager: ExternalIndexManager, resource_uuid: str
@@ -582,35 +563,8 @@ class Processor:
582
563
  resource_uuid=resource_uuid, resource_data=index_message
583
564
  )
584
565
 
585
- async def multi(self, message: writer_pb2.BrokerMessage, seqid: int) -> None:
586
- self.messages.setdefault(message.multiid, []).append(message)
587
-
588
- async def commit(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
589
- if message.multiid not in self.messages:
590
- # Error
591
- logger.error(f"Closed multi {message.multiid}")
592
- await self.deadletter([message], partition, seqid)
593
- else:
594
- await self.txn(self.messages[message.multiid], seqid, partition)
595
-
596
- async def rollback(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
597
- # Error
598
- logger.error(f"Closed multi {message.multiid}")
599
- del self.messages[message.multiid]
600
- await self.notify_abort(
601
- partition=partition,
602
- seqid=seqid,
603
- multi=message.multiid,
604
- kbid=message.kbid,
605
- rid=message.uuid,
606
- source=message.source,
607
- )
608
-
609
- async def deadletter(
610
- self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
611
- ) -> None:
612
- for seq, message in enumerate(messages):
613
- await self.storage.deadletter(message, seq, seqid, partition)
566
+ async def deadletter(self, message: writer_pb2.BrokerMessage, partition: str, seqid: int) -> None:
567
+ await self.storage.deadletter(message, 0, seqid, partition)
614
568
 
615
569
  @processor_observer.wrap({"type": "apply_resource"})
616
570
  async def apply_resource(
@@ -670,7 +624,6 @@ class Processor:
670
624
  *,
671
625
  partition: str,
672
626
  seqid: int,
673
- multi: str,
674
627
  message: writer_pb2.BrokerMessage,
675
628
  write_type: writer_pb2.Notification.WriteType.ValueType,
676
629
  ):
@@ -678,7 +631,7 @@ class Processor:
678
631
  notification = writer_pb2.Notification(
679
632
  partition=int(partition),
680
633
  seqid=seqid,
681
- multi=multi,
634
+ multi="",
682
635
  uuid=message.uuid,
683
636
  kbid=message.kbid,
684
637
  action=writer_pb2.Notification.Action.COMMIT,
@@ -698,7 +651,6 @@ class Processor:
698
651
  *,
699
652
  partition: str,
700
653
  seqid: int,
701
- multi: str,
702
654
  kbid: str,
703
655
  rid: str,
704
656
  source: writer_pb2.BrokerMessage.MessageSource.ValueType,
@@ -706,7 +658,7 @@ class Processor:
706
658
  message = writer_pb2.Notification(
707
659
  partition=int(partition),
708
660
  seqid=seqid,
709
- multi=multi,
661
+ multi="",
710
662
  uuid=rid,
711
663
  kbid=kbid,
712
664
  action=writer_pb2.Notification.ABORT,
@@ -722,7 +674,7 @@ class Processor:
722
674
  await self.pubsub.publish(channel, payload)
723
675
 
724
676
  async def _mark_resource_error(
725
- self, kb: KnowledgeBox, resource: Optional[Resource], partition: str, seqid: int
677
+ self, kb: KnowledgeBox, resource: Resource | None, partition: str, seqid: int
726
678
  ) -> None:
727
679
  """
728
680
  Unhandled error processing, try to mark resource as error
@@ -743,8 +695,8 @@ class Processor:
743
695
  # XXX: Why are these utility functions here?
744
696
  async def get_kb_obj(
745
697
  self, txn: Transaction, kbid: knowledgebox_pb2.KnowledgeBoxID
746
- ) -> Optional[KnowledgeBox]:
747
- uuid: Optional[str] = kbid.uuid
698
+ ) -> KnowledgeBox | None:
699
+ uuid: str | None = kbid.uuid
748
700
  if uuid == "":
749
701
  uuid = await datamanagers.kb.get_kb_uuid(txn, slug=kbid.slug)
750
702
 
@@ -759,23 +711,16 @@ class Processor:
759
711
  return kbobj
760
712
 
761
713
 
762
- def messages_source(messages: list[writer_pb2.BrokerMessage]):
763
- from_writer = all(
764
- (message.source == writer_pb2.BrokerMessage.MessageSource.WRITER for message in messages)
765
- )
766
- from_processor = all(
767
- (message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR for message in messages)
768
- )
769
- if from_writer:
770
- source = nodewriter_pb2.IndexMessageSource.WRITER
771
- elif from_processor:
772
- source = nodewriter_pb2.IndexMessageSource.PROCESSOR
714
+ def to_index_message_source(message: writer_pb2.BrokerMessage):
715
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
716
+ return nodewriter_pb2.IndexMessageSource.WRITER
717
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
718
+ return nodewriter_pb2.IndexMessageSource.PROCESSOR
773
719
  else: # pragma: no cover
774
- msg = "Processor received multiple broker messages with different sources in the same txn!"
720
+ msg = f"Processor received a broker message with unexpected source! {message.source}"
775
721
  logger.error(msg)
776
722
  errors.capture_exception(Exception(msg))
777
- source = nodewriter_pb2.IndexMessageSource.PROCESSOR
778
- return source
723
+ return nodewriter_pb2.IndexMessageSource.PROCESSOR
779
724
 
780
725
 
781
726
  def has_vectors_operation(index_message: PBBrainResource) -> bool:
@@ -17,14 +17,13 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  from nucliadb.common.maindb.driver import Driver, Transaction
23
22
 
24
23
  TXNID = "/internal/worker/{worker}"
25
24
 
26
25
 
27
- async def get_last_seqid(driver: Driver, worker: str) -> Optional[int]:
26
+ async def get_last_seqid(driver: Driver, worker: str) -> int | None:
28
27
  """
29
28
  Get last stored sequence id for a worker.
30
29