nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of nucliadb might be problematic.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/ingest/service/writer.py

@@ -126,6 +126,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
                 external_index_provider=request.external_index_provider,
                 hidden_resources_enabled=request.hidden_resources_enabled,
                 hidden_resources_hide_on_creation=request.hidden_resources_hide_on_creation,
+                prewarm_enabled=request.prewarm_enabled,
             )
 
         except KnowledgeBoxConflict:
@@ -167,11 +168,17 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
             )
 
         try:
-            async with self.driver.transaction() as txn:
-                kbid = await KnowledgeBoxORM.update(
-                    txn, uuid=request.uuid, slug=request.slug, config=request.config
-                )
-                await txn.commit()
+            kbid = await KnowledgeBoxORM.update(
+                self.driver,
+                kbid=request.uuid,
+                slug=request.slug,
+                title=request.config.title or None,
+                description=request.config.description or None,
+                external_index_provider=request.config.external_index_provider or None,
+                hidden_resources_enabled=request.config.hidden_resources_enabled,
+                hidden_resources_hide_on_creation=request.config.hidden_resources_hide_on_creation,
+                prewarm_enabled=request.config.prewarm_enabled,
+            )
         except KnowledgeBoxNotFound:
             return UpdateKnowledgeBoxResponse(status=KnowledgeBoxResponseStatus.NOTFOUND)
         except Exception:
@@ -221,13 +228,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: NewEntitiesGroupRequest, context=None
     ) -> NewEntitiesGroupResponse:
         response = NewEntitiesGroupResponse()
-        async with self.driver.transaction(read_only=True) as ro_txn:
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = NewEntitiesGroupResponse.Status.KB_NOT_FOUND
                 return response
 
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -244,7 +251,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: GetEntitiesRequest, context=None
     ) -> GetEntitiesResponse:
         response = GetEntitiesResponse()
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = GetEntitiesResponse.Status.NOTFOUND
@@ -266,7 +273,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: ListEntitiesGroupsRequest, context=None
     ) -> ListEntitiesGroupsResponse:
         response = ListEntitiesGroupsResponse()
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = ListEntitiesGroupsResponse.Status.NOTFOUND
@@ -290,7 +297,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: GetEntitiesGroupRequest, context=None
     ) -> GetEntitiesGroupResponse:
         response = GetEntitiesGroupResponse()
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = GetEntitiesGroupResponse.Status.KB_NOT_FOUND
@@ -315,13 +322,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
 
     async def SetEntities(self, request: SetEntitiesRequest, context=None) -> OpStatusWriter:  # type: ignore
         response = OpStatusWriter()
-        async with self.driver.transaction(read_only=True) as ro_txn:
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = OpStatusWriter.Status.NOTFOUND
                 return response
 
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -339,13 +346,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: UpdateEntitiesGroupRequest, context=None
     ) -> UpdateEntitiesGroupResponse:
         response = UpdateEntitiesGroupResponse()
-        async with self.driver.transaction(read_only=True) as ro_txn:
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = UpdateEntitiesGroupResponse.Status.KB_NOT_FOUND
                 return response
 
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -368,13 +375,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
     async def DelEntities(self, request: DelEntitiesRequest, context=None) -> OpStatusWriter:  # type: ignore
         response = OpStatusWriter()
 
-        async with self.driver.transaction(read_only=True) as ro_txn:
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = OpStatusWriter.Status.NOTFOUND
                 return response
 
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -393,7 +400,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
     ) -> WriterStatusResponse:
         logger.info("Status Call")
         response = WriterStatusResponse()
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             async for _, slug in datamanagers.kb.get_kbs(txn):
                 response.knowledgeboxes.append(slug)
 
@@ -405,7 +412,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         return response
 
     async def Index(self, request: IndexResource, context=None) -> IndexStatus:  # type: ignore
-        async with self.driver.transaction() as txn:
+        async with self.driver.ro_transaction() as txn:
             kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
             resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
             bm = await generate_broker_message(resobj)
@@ -419,7 +426,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
 
     async def ReIndex(self, request: IndexResource, context=None) -> IndexStatus:  # type: ignore
         try:
-            async with self.driver.transaction() as txn:
+            async with self.driver.rw_transaction() as txn:
                 kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
                 resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
                 resobj.disable_vectors = not request.reindex_vectors
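
Most of the churn in this file is mechanical: the generic driver.transaction(read_only=True) / driver.transaction() calls become the explicit driver.ro_transaction() / driver.rw_transaction() helpers (a Driver API change; see nucliadb/common/maindb/driver.py in the file list above). A minimal sketch of the resulting pattern, assuming the transaction interface shown in these hunks; the import path, key, and value are illustrative:

from nucliadb.common.maindb.driver import Driver

async def read_then_write(driver: Driver) -> None:
    # Read-only transaction: nothing to commit, so the context manager
    # alone is enough.
    async with driver.ro_transaction() as txn:
        value = await txn.get("/some/key")

    # Read-write transaction: mutations are only persisted after an
    # explicit commit, exactly as with the old driver.transaction() API.
    async with driver.rw_transaction() as txn:
        await txn.set("/some/key", value or b"default")
        await txn.commit()

The method names make read-only intent visible at the call site and let the driver enforce that no writes are issued on a read-only transaction.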
nucliadb/ingest/settings.py

@@ -67,6 +67,11 @@ class DriverSettings(BaseSettings):
     )
 
 
+class CatalogConfig(Enum):
+    UNSET = "unset"
+    PG = "pg"
+
+
 # For use during migration from pull v1 to pull v2
 class ProcessingPullMode(Enum):
     OFF = "off"
@@ -75,26 +80,43 @@ class ProcessingPullMode(Enum):
 
 
 class Settings(DriverSettings):
-    grpc_port: int = 8030
-
-    partitions: list[str] = ["1"]
+    # Catalog settings
+    catalog: CatalogConfig = Field(default=CatalogConfig.PG, description="Catalog backend")
 
+    # Pull worker settings
     pull_time_error_backoff: int = 30
     pull_api_timeout: int = 60
-    disable_pull_worker: bool = False
+    disable_pull_worker: bool = Field(
+        default=False, description="Set to true to disable the pull worker task"
+    )
 
-    # ingest consumer sts replica settings
-    replica_number: int = -1
-    total_replicas: int = 1  # number of ingest processor replicas in the cluster
-    nuclia_partitions: int = 50
+    # Ingest consumer sts replica settings
+    replica_number: int = Field(
+        default=-1,
+        description="The replica number of this ingest statefulset instance. Leave to -1 to auto-assign based on hostname.",
+    )
+    total_replicas: int = Field(default=1, description="Number of ingest statefulset replicas deployed")
+    nuclia_partitions: int = Field(
+        default=50, description="Total number of partitions of the nats stream."
+    )
+    partitions: list[str] = Field(
+        default=["1"],
+        description="List of partitions assigned to this ingest statefulset instance. This is automatically assigned based on the replica number and total replicas.",
+    )
+    max_concurrent_ingest_processing: int = Field(
+        default=5,
+        description="Controls the number of concurrent messages from different partitions that can be processed at the same time by ingest statefulset consumers.",
+    )
 
-    max_receive_message_length: int = 500  # In MB
+    # Grpc server settings
+    grpc_port: int = 8030
+    max_receive_message_length: int = Field(
+        default=500, description="Maximum receive grpc message length in MB."
+    )
 
     # Search query timeouts
     relation_search_timeout: float = 10.0
     relation_types_timeout: float = 10.0
 
-    max_concurrent_ingest_processing: int = 5
-
 
 settings = Settings()
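
The new partitions description says assignment is derived from replica_number and total_replicas (the actual logic lives in nucliadb/ingest/partitions.py, also touched in this release). A hypothetical round-robin sketch of what such an assignment could look like; the function name and modulo scheme are assumptions, not the shipped implementation:

def assign_partitions(replica_number: int, total_replicas: int, nuclia_partitions: int) -> list[str]:
    # Replica i takes every total_replicas-th partition, so all
    # nuclia_partitions are covered with no overlap between replicas.
    return [
        str(p)
        for p in range(1, nuclia_partitions + 1)
        if (p - 1) % total_replicas == replica_number
    ]

# With total_replicas=2 and nuclia_partitions=6:
#   replica 0 -> ['1', '3', '5']
#   replica 1 -> ['2', '4', '6']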
nucliadb/learning_proxy.py

@@ -43,8 +43,8 @@ logger = logging.getLogger(SERVICE_NAME)
 WHITELISTED_HEADERS = {
     "x-nucliadb-user",
     "x-nucliadb-roles",
-    "x-stf-roles",
-    "x-stf-user",
+    "x-nucliadb-account",
+    "x-nucliadb-account-type",
     "x-forwarded-for",
     "x-forwarded-host",
     "x-forwarded-proto",
@@ -204,14 +204,14 @@ async def learning_config_proxy(
     request: Request,
     method: str,
     url: str,
-    extra_headers: Optional[dict[str, str]] = None,
+    headers: dict[str, str] = {},
 ) -> Union[Response, StreamingResponse]:
     return await proxy(
         service=LearningService.CONFIG,
         request=request,
         method=method,
         url=url,
-        extra_headers=extra_headers,
+        headers=headers,
     )
 
 
@@ -244,24 +244,21 @@ async def proxy(
     request: Request,
     method: str,
     url: str,
-    extra_headers: Optional[dict[str, str]] = None,
+    headers: dict[str, str] = {},
 ) -> Union[Response, StreamingResponse]:
     """
     Proxy the request to a learning API.
 
-    service: LearningService. The learning service to proxy the request to.
-    request: Request. The incoming request.
-    method: str. The HTTP method to use.
-    url: str. The URL to proxy the request to.
-    extra_headers: Optional[dict[str, str]]. Extra headers to include in the proxied request.
+    service: The learning service to proxy the request to.
+    request: The incoming request.
+    method: The HTTP method to use.
+    url: The URL to proxy the request to.
+    headers: Extra headers to include in the proxied request.
 
     Returns: Response. The response from the learning API. If the response is chunked, a StreamingResponse is returned.
     """
-
-    proxied_headers = extra_headers or {}
-    proxied_headers.update(
-        {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
-    )
+    proxied_headers = {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
+    proxied_headers.update(**headers)
 
     async with service_client(
         base_url=get_base_url(service=service),
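
Beyond the rename from extra_headers to headers, the merge order is reversed, which flips precedence: previously the whitelisted request headers overwrote any explicitly passed ones; now the explicit headers win. A self-contained illustration with hypothetical values:

request_headers = {"x-nucliadb-user": "alice", "x-nucliadb-account": "acc-1"}
explicit = {"x-nucliadb-account": "override"}

# Old order: seed with explicit headers, then overwrite from the request.
old = dict(explicit)
old.update(request_headers)
assert old["x-nucliadb-account"] == "acc-1"      # request wins

# New order: seed from the request, then overwrite with explicit headers.
new = dict(request_headers)
new.update(**explicit)
assert new["x-nucliadb-account"] == "override"   # caller wins

Note that the mutable default headers: dict[str, str] = {} is only safe because proxy() never mutates the argument (it copies into proxied_headers).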
nucliadb/metrics_exporter.py

@@ -37,19 +37,24 @@ MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "versi
 
 PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
 
+KBS_COUNT = metrics.Gauge("nucliadb_kbs_count")
+RESOURCES_COUNT = metrics.Gauge("nucliadb_resources_count")
+
 
 async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
     """
     Return a list of all KB ids.
     """
-    async with context.kv_driver.transaction(read_only=True) as txn:
+    async with context.kv_driver.ro_transaction() as txn:
         async for kbid, _ in datamanagers.kb.get_kbs(txn):
             yield kbid
 
 
-async def update_migration_metrics(context: ApplicationContext):
+async def update_kb_metrics(context: ApplicationContext):
     """
-    Report the global migration version and the number of KBs per migration version.
+    Report metrics at the kb level:
+    - total number of KBs
+    - the global migration version and the number of KBs per migration version.
     """
     # Clear previoulsy set values so that we report only the current state
     MIGRATION_COUNT.gauge.clear()
@@ -60,12 +65,16 @@ async def update_migration_metrics(context: ApplicationContext):
     MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
 
     version_count: dict[str, int] = {}
+    n_kbs = 0
     async for kbid in iter_kbids(context):
         kb_info = await mdm.get_kb_info(kbid)
         if kb_info is not None:
             current_version = str(kb_info.current_version)
             version_count.setdefault(current_version, 0)
             version_count[current_version] += 1
+        n_kbs += 1
+
+    KBS_COUNT.set(n_kbs)
 
     for version, count in version_count.items():
         MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
@@ -88,6 +97,10 @@ async def update_resource_metrics(context: ApplicationContext):
             count = cast(Tuple[int], await cur.fetchone())[0]
             PENDING_RESOURCE_COUNT.set(count)
 
+            await cur.execute("SELECT COUNT(*) FROM catalog")
+            count = cast(Tuple[int], await cur.fetchone())[0]
+            RESOURCES_COUNT.set(count)
+
 
 async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
     """
@@ -109,7 +122,7 @@ async def run_exporter(context: ApplicationContext):
     # Schedule exporter tasks
     tasks = []
     for export_task, interval in [
-        (update_migration_metrics, 60 * 3),
+        (update_kb_metrics, 60 * 3),
         (update_resource_metrics, 60 * 5),
     ]:
         tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
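
The renamed update_kb_metrics keeps the clear-then-set gauge pattern, so label combinations that disappear (for example, a migration version no KB is on anymore) stop being exported. A minimal sketch of that pattern using the same wrapper calls seen in this file; the import path is an assumption:

from nucliadb_telemetry import metrics  # assumed location of the metrics wrapper

MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "version": ""})

def report_versions(version_count: dict[str, int]) -> None:
    # Drop every previously exported label combination first; otherwise a
    # KB that moved from version 5 to 6 would keep reporting under 5.
    MIGRATION_COUNT.gauge.clear()
    for version, count in version_count.items():
        MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))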
nucliadb/migrator/datamanager.py

@@ -48,45 +48,39 @@ class MigrationsDataManager:
 
     async def schedule_all_kbs(self, target_version: int) -> None:
         # Get all kb ids
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
         # Schedule the migrations
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             for kbid in kbids:
                 await txn.set(MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode())
             await txn.commit()
 
     async def get_kb_migrations(self) -> list[str]:
         keys = []
-        async with self.driver.transaction() as txn:
+        async with self.driver.ro_transaction() as txn:
             async for key in txn.keys(MIGRATIONS_CONTAINER_KEY):
                 keys.append(key.split("/")[-1])
 
         return keys
 
     async def delete_kb_migration(self, *, kbid: str) -> None:
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(MIGRATIONS_KEY.format(kbid=kbid))
             await txn.commit()
 
     async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
             if kb_config is None:
                 return None
             return KnowledgeBoxInfo(current_version=kb_config.migration_version)
 
     async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
-        async with self.driver.transaction() as txn:
-            kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
-            if kb_config is None:
-                raise Exception(f"KB {kbid} does not exist")
-            kb_config.migration_version = current_version
-            await KnowledgeBoxORM.update(txn, kbid, config=kb_config)
-            await txn.commit()
+        await KnowledgeBoxORM.update(self.driver, kbid, migration_version=current_version)
 
     async def get_global_info(self) -> GlobalInfo:
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             raw_pb = await txn.get(MIGRATION_INFO_KEY)
             if raw_pb is None:
                 return GlobalInfo(current_version=0, target_version=None)
@@ -100,7 +94,7 @@ class MigrationsDataManager:
         current_version: Union[int, _Unset] = _UNSET,
         target_version: Union[int, None, _Unset] = _UNSET,
     ) -> None:
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
             pb = migrations_pb2.MigrationInfo()
             if raw_pb is not None:
@@ -116,18 +110,18 @@ class MigrationsDataManager:
 
     async def get_kbs_to_rollover(self) -> list[str]:
         keys = []
-        async with self.driver.transaction() as txn:
+        async with self.driver.ro_transaction() as txn:
             async for key in txn.keys(ROLLOVER_CONTAINER_KEY):
                 keys.append(key.split("/")[-1])
 
         return keys
 
     async def add_kb_rollover(self, kbid: str) -> None:
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             await txn.set(ROLLOVER_KEY.format(kbid=kbid), b"")
             await txn.commit()
 
     async def delete_kb_rollover(self, kbid: str) -> None:
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(ROLLOVER_KEY.format(kbid=kbid))
             await txn.commit()
nucliadb/migrator/migrator.py

@@ -211,7 +211,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
 
     # The migration uses two transactions. The former is only used to get a lock (pg_advisory_lock)
     # without having to worry about correctly unlocking it (postgres unlocks it when the transaction ends)
-    async with driver.transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock:  # type: ignore[attr-defined]
+    async with driver.rw_transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock:  # type: ignore[attr-defined]
         await cur_lock.execute(
             "CREATE TABLE IF NOT EXISTS migrations (version INT PRIMARY KEY, migrated_at TIMESTAMP NOT NULL DEFAULT NOW())"
         )
@@ -227,7 +227,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
 
     # Gets a new transaction for each migration, so if they get interrupted we at least
     # save the state of the last finished transaction
-    async with driver.transaction() as tx, tx.connection.cursor() as cur:  # type: ignore[attr-defined]
+    async with driver.rw_transaction() as tx, tx.connection.cursor() as cur:  # type: ignore[attr-defined]
        await migration.migrate(tx)
         await cur.execute("INSERT INTO migrations (version) VALUES (%s)", (version,))
         await tx.commit()
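
The two-transaction scheme the comments describe is worth spelling out: one long-lived transaction exists solely to hold a Postgres advisory lock, released automatically when it ends, while each migration commits in its own transaction so progress survives interruption. A hedged sketch of the locking half; the lock key is illustrative, and the shipped code may use pg_advisory_lock rather than the transaction-scoped variant shown here:

async def with_migration_lock(driver) -> None:
    # Transaction 1: exists only to hold the lock. With the _xact_
    # variant, Postgres drops the lock when this transaction ends, even
    # if a migration raises; there is no unlock bookkeeping to get wrong.
    async with driver.rw_transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock:
        await cur_lock.execute("SELECT pg_advisory_xact_lock(%s)", (42,))  # illustrative key
        # Transactions 2..N: each migration runs and commits separately
        # (see the second hunk above), so an interrupted run resumes from
        # the last version recorded in the migrations table.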
nucliadb/purge/__init__.py

@@ -19,6 +19,7 @@
 #
 import asyncio
 import importlib.metadata
+from itertools import batched  # type: ignore
 from typing import AsyncGenerator
 
 from nucliadb.common import datamanagers
@@ -47,7 +48,7 @@ from nucliadb_utils.utilities import get_storage
 
 
 async def _iter_keys(driver: Driver, match: str) -> AsyncGenerator[str, None]:
-    async with driver.transaction(read_only=True) as keys_txn:
+    async with driver.ro_transaction() as keys_txn:
         async for key in keys_txn.keys(match=match):
             yield key
 
@@ -81,9 +82,9 @@ async def purge_kb(driver: Driver):
             )
             continue
 
-        # Now delete the tikv delete mark
+        # Now delete the delete mark
         try:
-            async with driver.transaction() as txn:
+            async with driver.rw_transaction() as txn:
                 key_to_purge = KB_TO_DELETE.format(kbid=kbid)
                 await txn.delete(key_to_purge)
                 await txn.commit()
@@ -126,7 +127,7 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
 
         if delete_marker:
             try:
-                async with driver.transaction() as txn:
+                async with driver.rw_transaction() as txn:
                     await txn.delete(key)
                     await txn.commit()
                 logger.info(f"  √ Deleted storage deletion marker {key}")
@@ -163,7 +164,7 @@ async def _count_resources_storage_to_purge(driver: Driver) -> int:
     """
     Count the number of resources marked as deleted in storage.
     """
-    async with driver.transaction(read_only=True) as txn:
+    async with driver.ro_transaction() as txn:
         return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
 
 
@@ -174,7 +175,7 @@ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch
     """
     # Get the keys of the resources to delete in batches of 100
     to_delete_batch = []
-    async with driver.transaction(read_only=True) as txn:
+    async with driver.ro_transaction() as txn:
         async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
             to_delete_batch.append(key)
 
@@ -194,7 +195,7 @@ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch
     await asyncio.gather(*tasks)
 
     # Delete the schedule-to-delete keys
-    async with driver.transaction() as txn:
+    async with driver.rw_transaction() as txn:
         for key in to_delete_batch:
             await txn.delete(key)
         await txn.commit()
@@ -220,20 +221,20 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             continue
 
         try:
-            async with driver.transaction(read_only=True) as txn:
+            async with driver.ro_transaction() as txn:
                 value = await txn.get(key)
                 assert value is not None, "Key must exist or we wouldn't had fetch it iterating keys"
                 purge_payload = VectorSetPurge()
                 purge_payload.ParseFromString(value)
 
             fields: list[Field] = []
-            async with driver.transaction(read_only=True) as txn:
+            async with driver.ro_transaction() as txn:
                 kb = KnowledgeBox(txn, storage, kbid)
                 async for resource in kb.iterate_resources():
                     fields.extend((await resource.get_fields(force=True)).values())
 
             logger.info(f"Purging {len(fields)} fields for vectorset {vectorset}", extra={"kbid": kbid})
-            for fields_batch in batchify(fields, 20):
+            for fields_batch in batched(fields, n=20):
                 tasks = []
                 for field in fields_batch:
                     if purge_payload.storage_key_kind == VectorSetConfig.StorageKeyKind.UNSET:
@@ -259,7 +260,7 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             await asyncio.gather(*tasks)
 
             # Finally, delete the key
-            async with driver.transaction() as txn:
+            async with driver.rw_transaction() as txn:
                 await txn.delete(key)
                 await txn.commit()
 
@@ -317,9 +318,3 @@ def run() -> int:  # pragma: no cover
     setup_logging()
     errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
     return asyncio.run(main())
-
-
-def batchify(iterable, n=1):
-    """Yield successive n-sized chunks from iterable."""
-    for i in range(0, len(iterable), n):
-        yield iterable[i : i + n]
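
The hand-rolled batchify() helper at the bottom of the module is replaced by itertools.batched, available since Python 3.12 (the # type: ignore on the import suggests older type stubs don't know it yet). One behavioral nuance: batchify yielded list slices while batched yields tuples, which is compatible with the iteration done here. A quick demonstration:

from itertools import batched  # Python 3.12+

fields = ["f1", "f2", "f3", "f4", "f5"]
for batch in batched(fields, n=2):
    print(batch)
# ('f1', 'f2')
# ('f3', 'f4')
# ('f5',)   <- the last batch may be short, same as the old helper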
nucliadb/purge/orphan_shards.py

@@ -100,7 +100,7 @@ async def _get_indexed_shards() -> dict[str, ShardKb]:
 async def _get_stored_shards(driver: Driver) -> dict[str, ShardKb]:
     stored_shards: dict[str, ShardKb] = {}
 
-    async with driver.transaction(read_only=True) as txn:
+    async with driver.ro_transaction() as txn:
         async for kbid, _ in datamanagers.kb.get_kbs(txn):
             kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
             if kb_shards is None:
@@ -135,7 +135,7 @@ async def _get_kbid(shard_id: str) -> Optional[str]:
 async def report_orphan_shards(driver: Driver):
     orphan_shards = await detect_orphan_shards(driver)
     logger.info(f"Found {len(orphan_shards)} orphan shards")
-    async with driver.transaction(read_only=True) as txn:
+    async with driver.ro_transaction() as txn:
         for shard_id, kbid in orphan_shards.items():
             if kbid == UNKNOWN_KB:
                 msg = "Found orphan shard but could not get KB info"
nucliadb/reader/api/v1/knowledgebox.py

@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from fastapi import HTTPException
+from fastapi import Header, HTTPException
 from fastapi_versioning import version
 from starlette.requests import Request
 
@@ -44,12 +44,20 @@ from nucliadb_utils.authentication import requires, requires_one
 )
 @requires(NucliaDBRoles.MANAGER)
 @version(1)
-async def get_kbs(request: Request, prefix: str = "") -> KnowledgeBoxList:
+async def get_kbs(
+    request: Request,
+    prefix: str = "",
+    x_nucliadb_account: str = Header(default="", include_in_schema=False),
+) -> KnowledgeBoxList:
     driver = get_driver()
-    async with driver.transaction(read_only=True) as txn:
+    async with driver.ro_transaction() as txn:
         response = KnowledgeBoxList()
         async for kbid, slug in datamanagers.kb.get_kbs(txn, prefix=prefix):
-            response.kbs.append(KnowledgeBoxObjSummary(slug=slug or None, uuid=kbid))
+            response.kbs.append(
+                KnowledgeBoxObjSummary(
+                    slug=user_kb_slug(slug, account_id=x_nucliadb_account) or None, uuid=kbid
+                )
+            )
     return response
 
 
@@ -62,16 +70,18 @@ async def get_kbs(request: Request, prefix: str = "") -> KnowledgeBoxList:
 )
 @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.READER])
 @version(1)
-async def get_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
+async def get_kb(
+    request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
+) -> KnowledgeBoxObj:
     driver = get_driver()
-    async with driver.transaction(read_only=True) as txn:
+    async with driver.ro_transaction() as txn:
         kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
         if kb_config is None:
             raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
 
     return KnowledgeBoxObj(
         uuid=kbid,
-        slug=kb_config.slug,
+        slug=user_kb_slug(kb_config.slug, account_id=x_nucliadb_account),
         config=from_proto.knowledgebox_config(kb_config),
     )
 
@@ -85,12 +95,18 @@ async def get_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
 )
 @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.READER])
 @version(1)
-async def get_kb_by_slug(request: Request, slug: str) -> KnowledgeBoxObj:
+async def get_kb_by_slug(
+    request: Request, slug: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
+) -> KnowledgeBoxObj:
     driver = get_driver()
-    async with driver.transaction(read_only=True) as txn:
-        kbid = await datamanagers.kb.get_kb_uuid(txn, slug=slug)
+    async with driver.ro_transaction() as txn:
+        # For cloud, the account id is prepended in order to be able to reuse the same slug in different accounts.
+        kbid = await datamanagers.kb.get_kb_uuid(txn, slug=f"{x_nucliadb_account}:{slug}")
         if kbid is None:
-            raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
+            # For onprem, the slug is fully controlled by the user
+            kbid = await datamanagers.kb.get_kb_uuid(txn, slug=slug)
+            if kbid is None:
+                raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
 
         kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
         if kb_config is None:
@@ -98,6 +114,18 @@ async def get_kb_by_slug(request: Request, slug: str) -> KnowledgeBoxObj:
 
     return KnowledgeBoxObj(
         uuid=kbid,
-        slug=kb_config.slug,
+        slug=user_kb_slug(kb_config.slug, account_id=x_nucliadb_account),
         config=from_proto.knowledgebox_config(kb_config),
     )
+
+
+def user_kb_slug(stored_slug: str, account_id: str) -> str:
+    if account_id != "":
+        # On cloud deployments, backend prepends the account id to the user-defined slug.
+        # This is required to make kb slugs reused across different accounts using the same nucliadb.
+        # We strip it so the user does not see it.
+        return stored_slug.split(f"{account_id}:")[-1]
+    else:
+        # On on-prem deployments, the account_id is set to "" by default and we don't need to strip
+        # anything as the backend is not invovled in the kb creation process.
+        return stored_slug
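
The new user_kb_slug() helper and the account header close the loop on multi-tenant slugs: cloud deployments store slugs as "<account_id>:<slug>", lookups try the prefixed form first and fall back to the raw slug for on-prem, and responses strip the prefix before the user sees it. With the function defined in the hunk above:

# Cloud: the stored slug carries the account prefix; it is stripped on the way out.
assert user_kb_slug("acc-123:my-kb", account_id="acc-123") == "my-kb"

# On-prem: no x-nucliadb-account header, so account_id defaults to "" and
# the stored slug is returned untouched.
assert user_kb_slug("my-kb", account_id="") == "my-kb"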