nucliadb 6.3.1.post3571__py3-none-any.whl → 6.3.1.post3577__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
+ import logging
21
22
  import tarfile
22
23
  from datetime import datetime, timezone
23
24
  from typing import AsyncIterator, Optional
@@ -29,6 +30,7 @@ from nucliadb.backups.const import (
29
30
  )
30
31
  from nucliadb.backups.models import BackupMetadata, CreateBackupRequest
31
32
  from nucliadb.backups.settings import settings
33
+ from nucliadb.backups.utils import exists_in_storge
32
34
  from nucliadb.common import datamanagers
33
35
  from nucliadb.common.context import ApplicationContext
34
36
  from nucliadb.export_import.utils import (
@@ -44,6 +46,8 @@ from nucliadb_utils.audit.stream import StreamAuditStorage
44
46
  from nucliadb_utils.storages.storage import StorageField
45
47
  from nucliadb_utils.utilities import get_audit
46
48
 
49
+ logger = logging.getLogger(__name__)
50
+
47
51
 
48
52
  async def backup_kb_task(context: ApplicationContext, msg: CreateBackupRequest):
49
53
  kbid = msg.kb_id
@@ -101,6 +105,10 @@ async def backup_resources(context: ApplicationContext, kbid: str, backup_id: st
101
105
  await set_metadata(context, kbid, backup_id, metadata)
102
106
  tasks = []
103
107
  backing_up = []
108
+ logger.info(
109
+ f"Backup resources: {len(metadata.missing_resources)} remaining",
110
+ extra={"kbid": kbid, "backup_id": backup_id},
111
+ )
104
112
  if len(tasks) > 0:
105
113
  resources_bytes = await asyncio.gather(*tasks)
106
114
  metadata.total_size += sum(resources_bytes)
@@ -108,6 +116,7 @@ async def backup_resources(context: ApplicationContext, kbid: str, backup_id: st
108
116
  await set_metadata(context, kbid, backup_id, metadata)
109
117
  tasks = []
110
118
  backing_up = []
119
+ logger.info(f"Backup resources: completed", extra={"kbid": kbid, "backup_id": backup_id})
111
120
 
112
121
 
113
122
  async def backup_resource(context: ApplicationContext, backup_id: str, kbid: str, rid: str) -> int:
@@ -163,6 +172,13 @@ async def backup_resource_with_binaries(
163
172
  nonlocal total_size
164
173
 
165
174
  for cloud_file in get_cloud_files(bm):
175
+ if not await exists_cf(context, cloud_file):
176
+ logger.warning(
177
+ "Cloud file not found in storage, skipping",
178
+ extra={"kbid": kbid, "rid": rid, "cf_uri": cloud_file.uri},
179
+ )
180
+ continue
181
+
166
182
  serialized_cf = cloud_file.SerializeToString()
167
183
 
168
184
  async def cf_iterator():
@@ -244,6 +260,10 @@ async def delete_metadata(context: ApplicationContext, kbid: str, backup_id: str
244
260
  await txn.commit()
245
261
 
246
262
 
263
+ async def exists_cf(context: ApplicationContext, cf: resources_pb2.CloudFile) -> bool:
264
+ return await exists_in_storge(context.blob_storage, cf.bucket_name, cf.uri)
265
+
266
+
247
267
  async def upload_to_bucket(context: ApplicationContext, bytes_iterator: AsyncIterator[bytes], key: str):
248
268
  storage = context.blob_storage
249
269
  bucket = settings.backups_bucket
@@ -21,6 +21,7 @@
21
21
 
22
22
  import asyncio
23
23
  import functools
24
+ import logging
24
25
  import tarfile
25
26
  from typing import AsyncIterator, Callable, Optional, Union
26
27
 
@@ -39,6 +40,8 @@ from nucliadb_protos import knowledgebox_pb2 as kb_pb2
39
40
  from nucliadb_protos.resources_pb2 import CloudFile
40
41
  from nucliadb_protos.writer_pb2 import BrokerMessage
41
42
 
43
+ logger = logging.getLogger(__name__)
44
+
42
45
 
43
46
  async def restore_kb_task(context: ApplicationContext, msg: RestoreBackupRequest):
44
47
  kbid = msg.kb_id
@@ -193,7 +196,7 @@ class ResourceBackupReader:
193
196
  elif tarinfo.name.startswith("cloud-files"):
194
197
  raw_cf = await self.read_data(tarinfo)
195
198
  cf = CloudFile()
196
- cf.FromString(raw_cf)
199
+ cf.ParseFromString(raw_cf)
197
200
  return cf
198
201
  elif tarinfo.name.startswith("binaries"):
199
202
  uri = tarinfo.name.lstrip("binaries/")
@@ -219,14 +222,19 @@ async def restore_resource(context: ApplicationContext, kbid: str, backup_id: st
219
222
  bm = item
220
223
  bm.kbid = kbid
221
224
  break
222
-
223
- # Read the cloud file and its binary
224
- cf = await reader.read_item()
225
- assert isinstance(cf, CloudFile)
226
- cf_binary = await reader.read_item()
227
- assert isinstance(cf_binary, CloudFileBinary)
228
- assert cf.uri == cf_binary.uri
229
- await import_binary(context, kbid, cf, cf_binary.read)
225
+ elif isinstance(item, CloudFile):
226
+ # Read its binary and import it
227
+ cf = item
228
+ cf_binary = await reader.read_item()
229
+ assert isinstance(cf_binary, CloudFileBinary)
230
+ assert cf.uri == cf_binary.uri
231
+ await import_binary(context, kbid, cf, cf_binary.read)
232
+ else:
233
+ logger.error(
234
+ "Unexpected item in resource backup. Backup may be corrupted",
235
+ extra={"item_type": type(item), kbid: kbid, resource_id: resource_id},
236
+ )
237
+ continue
230
238
 
231
239
  await import_broker_message(context, kbid, bm)
232
240
 
nucliadb/backups/utils.py CHANGED
@@ -24,9 +24,12 @@ from nucliadb_utils.storages.storage import Storage
24
24
 
25
25
 
26
26
  async def exists_backup(storage: Storage, backup_id: str) -> bool:
27
- async for _ in storage.iterate_objects(
28
- bucket=settings.backups_bucket,
29
- prefix=StorageKeys.BACKUP_PREFIX.format(backup_id=backup_id),
30
- ):
27
+ return await exists_in_storge(
28
+ storage, settings.backups_bucket, StorageKeys.BACKUP_PREFIX.format(backup_id=backup_id)
29
+ )
30
+
31
+
32
+ async def exists_in_storge(storage: Storage, bucket: str, key: str) -> bool:
33
+ async for _ in storage.iterate_objects(bucket=bucket, prefix=key):
31
34
  return True
32
35
  return False
@@ -272,6 +272,7 @@ async def get_relations_results_from_entities(
272
272
  timeout: Optional[float] = None,
273
273
  only_with_metadata: bool = False,
274
274
  only_agentic_relations: bool = False,
275
+ only_entity_to_entity: bool = False,
275
276
  deleted_entities: set[str] = set(),
276
277
  ) -> Relations:
277
278
  request = SearchRequest()
@@ -295,7 +296,11 @@ async def get_relations_results_from_entities(
295
296
  )
296
297
  relations_results: list[RelationSearchResponse] = [result.relation for result in results]
297
298
  return await merge_relations_results(
298
- relations_results, request.relation_subgraph, only_with_metadata, only_agentic_relations
299
+ relations_results,
300
+ request.relation_subgraph,
301
+ only_with_metadata,
302
+ only_agentic_relations,
303
+ only_entity_to_entity,
299
304
  )
300
305
 
301
306
 
@@ -369,8 +369,10 @@ async def get_graph_results(
369
369
  kbid=kbid,
370
370
  entities=entities_to_explore,
371
371
  timeout=5.0,
372
- only_with_metadata=True,
372
+ only_with_metadata=not graph_strategy.relation_text_as_paragraphs,
373
373
  only_agentic_relations=graph_strategy.agentic_graph_only,
374
+ # We only want entity to entity relations (skip resource/labels/collaborators/etc.)
375
+ only_entity_to_entity=True,
374
376
  deleted_entities=explored_entities,
375
377
  )
376
378
  except Exception as e:
@@ -683,6 +685,7 @@ def build_text_blocks_from_relations(
683
685
  triplets: dict[tuple[str, str, str], tuple[float, Relations, Optional[ParagraphId]]] = defaultdict(
684
686
  lambda: (0.0, Relations(entities={}), None)
685
687
  )
688
+ paragraph_count = 0
686
689
  for ent, subgraph in relations.entities.items():
687
690
  for rel, score in zip(subgraph.related_to, scores[ent]):
688
691
  key = (
@@ -702,6 +705,14 @@ def build_text_blocks_from_relations(
702
705
  # we keep the first one, but we lose the other ones
703
706
  if p_id is None and rel.metadata and rel.metadata.paragraph_id:
704
707
  p_id = ParagraphId.from_string(rel.metadata.paragraph_id)
708
+ else:
709
+ # No paragraph ID set, fake it so we can hydrate the resource
710
+ p_id = ParagraphId(
711
+ field_id=FieldId(rel.resource_id, "a", "usermetadata"),
712
+ paragraph_start=paragraph_count,
713
+ paragraph_end=paragraph_count + 1,
714
+ )
715
+ paragraph_count += 1
705
716
  existing_relations.entities[ent].related_to.append(rel)
706
717
  # XXX: Here we use the max even though all relations with same triplet should have same score
707
718
  triplets[key] = (max(existing_score, score), existing_relations, p_id)
@@ -35,6 +35,7 @@ from nucliadb.search.search.fetch import (
35
35
  )
36
36
  from nucliadb_models.common import FieldTypeName
37
37
  from nucliadb_models.labels import translate_system_to_alias_label
38
+ from nucliadb_models.metadata import RelationType
38
39
  from nucliadb_models.resource import ExtractedDataTypeName
39
40
  from nucliadb_models.search import (
40
41
  DirectionalRelation,
@@ -445,6 +446,7 @@ async def merge_relations_results(
445
446
  query: EntitiesSubgraphRequest,
446
447
  only_with_metadata: bool = False,
447
448
  only_agentic: bool = False,
449
+ only_entity_to_entity: bool = False,
448
450
  ) -> Relations:
449
451
  loop = asyncio.get_event_loop()
450
452
  return await loop.run_in_executor(
@@ -454,6 +456,7 @@ async def merge_relations_results(
454
456
  query,
455
457
  only_with_metadata,
456
458
  only_agentic,
459
+ only_entity_to_entity,
457
460
  )
458
461
 
459
462
 
@@ -462,6 +465,7 @@ def _merge_relations_results(
462
465
  query: EntitiesSubgraphRequest,
463
466
  only_with_metadata: bool,
464
467
  only_agentic: bool,
468
+ only_entity_to_entity: bool,
465
469
  ) -> Relations:
466
470
  """
467
471
  Merge relation search responses into a single Relations object while applying filters.
@@ -490,33 +494,41 @@ def _merge_relations_results(
490
494
  # If only_with_metadata is True, we check that metadata for the relation is not None
491
495
  # If only_agentic is True, we check that metadata for the relation is not None and that it has a data_augmentation_task_id
492
496
  # TODO: This is suboptimal, we should be able to filter this in the query to the index,
493
- if (not only_with_metadata or metadata) and (
494
- not only_agentic or (metadata and metadata.data_augmentation_task_id)
495
- ):
496
- if origin.value in relations.entities:
497
- relations.entities[origin.value].related_to.append(
498
- DirectionalRelation(
499
- entity=destination.value,
500
- entity_type=relation_node_type_to_entity_type(destination.ntype),
501
- entity_subtype=destination.subtype,
502
- relation=relation_type,
503
- relation_label=relation_label,
504
- direction=RelationDirection.OUT,
505
- metadata=from_proto.relation_metadata(metadata) if metadata else None,
506
- )
497
+ if only_with_metadata and not metadata:
498
+ continue
499
+
500
+ if only_agentic and (not metadata or not metadata.data_augmentation_task_id):
501
+ continue
502
+
503
+ if only_entity_to_entity and relation_type != RelationType.ENTITY:
504
+ continue
505
+
506
+ if origin.value in relations.entities:
507
+ relations.entities[origin.value].related_to.append(
508
+ DirectionalRelation(
509
+ entity=destination.value,
510
+ entity_type=relation_node_type_to_entity_type(destination.ntype),
511
+ entity_subtype=destination.subtype,
512
+ relation=relation_type,
513
+ relation_label=relation_label,
514
+ direction=RelationDirection.OUT,
515
+ metadata=from_proto.relation_metadata(metadata) if metadata else None,
516
+ resource_id=relation.resource_id,
507
517
  )
508
- elif destination.value in relations.entities:
509
- relations.entities[destination.value].related_to.append(
510
- DirectionalRelation(
511
- entity=origin.value,
512
- entity_type=relation_node_type_to_entity_type(origin.ntype),
513
- entity_subtype=origin.subtype,
514
- relation=relation_type,
515
- relation_label=relation_label,
516
- direction=RelationDirection.IN,
517
- metadata=from_proto.relation_metadata(metadata) if metadata else None,
518
- )
518
+ )
519
+ elif destination.value in relations.entities:
520
+ relations.entities[destination.value].related_to.append(
521
+ DirectionalRelation(
522
+ entity=origin.value,
523
+ entity_type=relation_node_type_to_entity_type(origin.ntype),
524
+ entity_subtype=origin.subtype,
525
+ relation=relation_type,
526
+ relation_label=relation_label,
527
+ direction=RelationDirection.IN,
528
+ metadata=from_proto.relation_metadata(metadata) if metadata else None,
529
+ resource_id=relation.resource_id,
519
530
  )
531
+ )
520
532
 
521
533
  return relations
522
534
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: nucliadb
3
- Version: 6.3.1.post3571
3
+ Version: 6.3.1.post3577
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.3.1.post3571
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.1.post3571
25
- Requires-Dist: nucliadb-protos>=6.3.1.post3571
26
- Requires-Dist: nucliadb-models>=6.3.1.post3571
27
- Requires-Dist: nidx-protos>=6.3.1.post3571
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.3.1.post3577
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.1.post3577
25
+ Requires-Dist: nucliadb-protos>=6.3.1.post3577
26
+ Requires-Dist: nucliadb-models>=6.3.1.post3577
27
+ Requires-Dist: nidx-protos>=6.3.1.post3577
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn
@@ -41,13 +41,13 @@ nucliadb/openapi.py,sha256=wDiw0dVEvTpJvbatkJ0JZLkKm9RItZT5PWRHjqRfqTA,2272
41
41
  nucliadb/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
42
  nucliadb/backups/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
43
43
  nucliadb/backups/const.py,sha256=9vPAhLxQO_gNAjSdPxWuv3V66s9WcdpjOQ89CZlfmuk,1894
44
- nucliadb/backups/create.py,sha256=AM_nC7TgHOX0EFGaTXClS28jBSK28fHrKNZi14z2wek,10442
44
+ nucliadb/backups/create.py,sha256=mvirguMbtxNgSDGG81l0kkgHWJSZPk4GFyra9nAkBZM,11275
45
45
  nucliadb/backups/delete.py,sha256=1rnBhVUGYYZJXSZUrrgYMDZ5NyswEWkIA-G-crRCyHk,2404
46
46
  nucliadb/backups/models.py,sha256=-hITU4Mv6AxePu12toBu_fjpEv6vVGcwNVxV22O9jQA,1273
47
- nucliadb/backups/restore.py,sha256=xhslVvTf4H8VmDucZpjrEFpKj6csPIWBadCPMVJYKQ8,9703
47
+ nucliadb/backups/restore.py,sha256=wepEgv4vBN5yeiZU-f17PbuFV4xT4_SVKplNr8xSJrE,10001
48
48
  nucliadb/backups/settings.py,sha256=SyzsInj1BRbBI0atg5IXWbMbOZ_eVg4eSQ3IcnUhCxQ,1357
49
49
  nucliadb/backups/tasks.py,sha256=4_kOVJ2yCwMvDEpzJgTuTt75TNlpq5woyw9sTAcaSkw,4194
50
- nucliadb/backups/utils.py,sha256=ayDaxfWP5cPnAkQH-tF4M6cnowsPQgU2ljYz_iL1CbE,1249
50
+ nucliadb/backups/utils.py,sha256=b1hi0gEp90tNrWHejNVoUgRpa4D6uKGhbACq0yeLkJY,1375
51
51
  nucliadb/common/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
52
52
  nucliadb/common/constants.py,sha256=QpigxJh_CtD85Evy0PtV5cVq6x0U_f9xfIcXz1ymkUg,869
53
53
  nucliadb/common/counters.py,sha256=8lOi3A2HeLDDlcNaS2QT1SfD3350VPBjiY3FkmHH1V8,977
@@ -224,10 +224,10 @@ nucliadb/search/search/fetch.py,sha256=XJHIFnZmXM_8Kb37lb4lg1GYG7cZ1plT-qAIb_Qzi
224
224
  nucliadb/search/search/filters.py,sha256=1MkHlJjAQqoRCj7e5cEzK2HvBxGLE17I_omsjiklbtw,6476
225
225
  nucliadb/search/search/find.py,sha256=jQZOqu8VeX8k3ELV8bLK4TwUUjGrvmubouxvO1IvJV0,10236
226
226
  nucliadb/search/search/find_merge.py,sha256=3FnzKFEnVemg6FO_6zveulbAU7klvsiPEBvLrpBBMg8,17450
227
- nucliadb/search/search/graph_strategy.py,sha256=ahwcUTQZ0Ll-rnS285DO9PmRyiM-1p4BM3UvmOYVwhM,31750
227
+ nucliadb/search/search/graph_strategy.py,sha256=gisL2GpbSIa_SucyOwEt7TWdqURyAQqxvD_-PkXQct8,32339
228
228
  nucliadb/search/search/hydrator.py,sha256=-R37gCrGxkyaiHQalnTWHNG_FCx11Zucd7qA1vQCxuw,6985
229
229
  nucliadb/search/search/ingestion_agents.py,sha256=NeJr4EEX-bvFFMGvXOOwLv8uU7NuQ-ntJnnrhnKfMzY,3174
230
- nucliadb/search/search/merge.py,sha256=i_PTBFRqC5iTTziOMEltxLIlmokIou5hjjgR4BnoLBE,22635
230
+ nucliadb/search/search/merge.py,sha256=aUn6f5XnwWzUFhVC6uBqHE8NKdlfgw_xcTo57rS23U8,22950
231
231
  nucliadb/search/search/metrics.py,sha256=GGGtXHLhK79_ESV277xkBVjcaMURXHCxYG0EdGamUd8,2886
232
232
  nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
233
233
  nucliadb/search/search/pgcatalog.py,sha256=V1NYLEUSXHpWmgcPIo1HS2riK_HDXSi-uykJjSoOOrE,9033
@@ -243,7 +243,7 @@ nucliadb/search/search/chat/ask.py,sha256=olZT08JVo3ZGDsDXkjvI2JTlqQln_o91HJzv0T
243
243
  nucliadb/search/search/chat/exceptions.py,sha256=Siy4GXW2L7oPhIR86H3WHBhE9lkV4A4YaAszuGGUf54,1356
244
244
  nucliadb/search/search/chat/images.py,sha256=PA8VWxT5_HUGfW1ULhKTK46UBsVyINtWWqEM1ulzX1E,3095
245
245
  nucliadb/search/search/chat/prompt.py,sha256=Jnja-Ss7skgnnDY8BymVfdeYsFPnIQFL8tEvcRXTKUE,47356
246
- nucliadb/search/search/chat/query.py,sha256=2QhVzvX12zLHOpVZ5MlBflqAauyCBl6dojhRGdm_6qU,16388
246
+ nucliadb/search/search/chat/query.py,sha256=0IoeW-JNaRBe2d9C3bXNfkYpzmsN_IIg3U4Vqb8eOEk,16485
247
247
  nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
248
248
  nucliadb/search/search/query_parser/catalog.py,sha256=PtH5nb6UTzH8l7Lmdd1RgLVFsn9CN5M5-JkVq9YeR4k,7116
249
249
  nucliadb/search/search/query_parser/exceptions.py,sha256=szAOXUZ27oNY-OSa9t2hQ5HHkQQC0EX1FZz_LluJHJE,1224
@@ -347,8 +347,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
347
347
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
348
348
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
349
349
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
350
- nucliadb-6.3.1.post3571.dist-info/METADATA,sha256=s2P-Covs_cwHf5pNszgsGypGofi5r1zgIOV7ccxAh6M,4291
351
- nucliadb-6.3.1.post3571.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
352
- nucliadb-6.3.1.post3571.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
353
- nucliadb-6.3.1.post3571.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
354
- nucliadb-6.3.1.post3571.dist-info/RECORD,,
350
+ nucliadb-6.3.1.post3577.dist-info/METADATA,sha256=7cMll6LH15F3_kAg1yrlOSDK8XFk77amc4TEa0kApug,4291
351
+ nucliadb-6.3.1.post3577.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
352
+ nucliadb-6.3.1.post3577.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
353
+ nucliadb-6.3.1.post3577.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
354
+ nucliadb-6.3.1.post3577.dist-info/RECORD,,