nucliadb 6.4.0.post4210__py3-none-any.whl → 6.4.0.post4224__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nucliadb/ingest/app.py CHANGED
@@ -103,7 +103,7 @@ async def initialize_pull_workers() -> list[Callable[[], Awaitable[None]]]:
103
103
  finalizers = await initialize_grpc()
104
104
  pull_workers = await consumer_service.start_pull_workers(SERVICE_NAME)
105
105
 
106
- return [pull_workers] + finalizers
106
+ return pull_workers + finalizers
107
107
 
108
108
 
109
109
  async def main_consumer(): # pragma: no cover
@@ -117,7 +117,7 @@ async def main_consumer(): # pragma: no cover
117
117
  ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
118
118
 
119
119
  await run_until_exit(
120
- [grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown] + finalizers
120
+ [grpc_health_finalizer, ingest_consumers, metrics_server.shutdown] + pull_workers + finalizers
121
121
  )
122
122
 
123
123
 
@@ -19,11 +19,14 @@
19
19
  #
20
20
  import asyncio
21
21
  import base64
22
+ from datetime import datetime, timezone
22
23
  from typing import Optional
23
24
 
24
25
  from aiohttp.client_exceptions import ClientConnectorError
25
26
 
26
27
  from nucliadb.common import datamanagers
28
+ from nucliadb.common.back_pressure.materializer import BackPressureMaterializer
29
+ from nucliadb.common.back_pressure.utils import BackPressureException
27
30
  from nucliadb.common.http_clients.processing import ProcessingHTTPClient, get_nua_api_id
28
31
  from nucliadb.common.maindb.driver import Driver
29
32
  from nucliadb.ingest import logger, logger_activity
@@ -57,6 +60,7 @@ class PullWorker:
57
60
  local_subscriber: bool = False,
58
61
  pull_time_empty_backoff: float = 5.0,
59
62
  pull_api_timeout: int = 60,
63
+ back_pressure: Optional[BackPressureMaterializer] = None,
60
64
  ):
61
65
  self.partition = partition
62
66
  self.pull_time_error_backoff = pull_time_error_backoff
@@ -65,6 +69,7 @@ class PullWorker:
65
69
  self.local_subscriber = local_subscriber
66
70
 
67
71
  self.processor = Processor(driver, storage, pubsub, partition)
72
+ self.back_pressure = back_pressure
68
73
 
69
74
  def __str__(self) -> str:
70
75
  return f"PullWorker(partition={self.partition})"
@@ -112,11 +117,29 @@ class PullWorker:
112
117
  transaction_check=False,
113
118
  )
114
119
 
120
async def back_pressure_check(self) -> None:
    """
    Wait until the back pressure checks stop raising.

    Returns immediately when no back pressure materializer was given to
    this worker. Otherwise the indexing and ingest checks are run; while
    either of them raises BackPressureException, the worker sleeps and
    then checks again. Any other error is reported and logged, and the
    method returns so that a failing check never stops the pull loop.
    """
    if self.back_pressure is None:
        return
    while True:
        try:
            self.back_pressure.check_indexing()
            self.back_pressure.check_ingest()
            break
        except BackPressureException as exc:
            # NOTE(review): assumes exc.data.try_after is a timezone-aware
            # datetime telling when to retry -- confirm against
            # nucliadb.common.back_pressure.utils.
            # The wait is the time remaining *until* try_after. Subtracting
            # the other way round yields a negative number for a future
            # try_after; asyncio.sleep() returns right away for non-positive
            # delays, which would turn this loop into a busy loop.
            remaining = (exc.data.try_after - datetime.now(timezone.utc)).total_seconds()
            # Never hand a negative delay to asyncio.sleep when try_after
            # is already in the past.
            sleep_time = max(remaining, 0.0)
            logger.warning(f"Back pressure active! Sleeping for {sleep_time} seconds", exc_info=True)
            await asyncio.sleep(sleep_time)
        except Exception as e:
            errors.capture_exception(e)
            logger.exception("Error while checking back pressure. Moving on")
            break
136
+
115
137
  async def loop(self):
116
138
  """
117
139
  Run this forever
118
140
  """
119
141
  while True:
142
+ await self.back_pressure_check()
120
143
  try:
121
144
  await self._loop()
122
145
  except ReallyStopPulling:
@@ -22,18 +22,22 @@ import sys
22
22
  from functools import partial
23
23
  from typing import Awaitable, Callable, Optional
24
24
 
25
+ from nucliadb.common.back_pressure.materializer import BackPressureMaterializer
26
+ from nucliadb.common.back_pressure.settings import settings as back_pressure_settings
27
+ from nucliadb.common.back_pressure.utils import is_back_pressure_enabled
25
28
  from nucliadb.common.maindb.utils import setup_driver
26
29
  from nucliadb.ingest import SERVICE_NAME, logger
27
30
  from nucliadb.ingest.consumer.consumer import IngestConsumer, IngestProcessedConsumer
28
31
  from nucliadb.ingest.consumer.pull import PullWorker
29
32
  from nucliadb.ingest.settings import settings
30
33
  from nucliadb_utils.exceptions import ConfigurationError
31
- from nucliadb_utils.settings import transaction_settings
34
+ from nucliadb_utils.settings import indexing_settings, transaction_settings
32
35
  from nucliadb_utils.utilities import (
33
36
  get_audit,
34
37
  get_nats_manager,
35
38
  get_pubsub,
36
39
  get_storage,
40
+ start_nats_manager,
37
41
  )
38
42
 
39
43
  from .auditing import IndexAuditHandler, ResourceWritesAuditHandler
@@ -54,12 +58,39 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
54
58
  await asyncio.gather(*tasks, return_exceptions=True)
55
59
 
56
60
 
61
async def start_back_pressure() -> BackPressureMaterializer:
    """
    Build and start the back pressure materializer.

    A NATS manager is started with the index jetstream servers and auth
    taken from the indexing settings, and the materializer is created on
    top of it using the check intervals from the back pressure settings.

    Returns the materializer after its `start()` coroutine has completed.
    """
    logger.info("Starting back pressure materializer")
    manager = await start_nats_manager(
        SERVICE_NAME,
        indexing_settings.index_jetstream_servers,
        indexing_settings.index_jetstream_auth,
    )
    # Both check intervals come from the back pressure settings object.
    intervals = {
        "indexing_check_interval": back_pressure_settings.indexing_check_interval,
        "ingest_check_interval": back_pressure_settings.ingest_check_interval,
    }
    materializer = BackPressureMaterializer(manager, **intervals)
    await materializer.start()
    return materializer
75
+
76
+
77
async def stop_back_pressure(materializer: BackPressureMaterializer) -> None:
    """
    Stop the back pressure materializer and finalize its NATS manager.

    The NATS manager is finalized even when stopping the materializer
    raises, so the connection is not left open on a failed shutdown. An
    exception raised by `stop()` still propagates to the caller.
    """
    try:
        await materializer.stop()
    finally:
        # Always release the NATS manager held by the materializer.
        await materializer.nats_manager.finalize()
80
+
81
+
57
82
  async def start_pull_workers(
58
83
  service_name: Optional[str] = None,
59
- ) -> Callable[[], Awaitable[None]]:
84
+ ) -> list[Callable[[], Awaitable[None]]]:
85
+ finalizers: list[Callable[[], Awaitable[None]]] = []
86
+
60
87
  driver = await setup_driver()
61
88
  pubsub = await get_pubsub()
62
89
  storage = await get_storage(service_name=service_name or SERVICE_NAME)
90
+ back_pressure = None
91
+ if is_back_pressure_enabled():
92
+ back_pressure = await start_back_pressure()
93
+ finalizers.append(partial(stop_back_pressure, back_pressure))
63
94
  tasks = []
64
95
  for partition in settings.partitions:
65
96
  worker = PullWorker(
@@ -70,12 +101,14 @@ async def start_pull_workers(
70
101
  pubsub=pubsub,
71
102
  local_subscriber=transaction_settings.transaction_local,
72
103
  pull_api_timeout=settings.pull_api_timeout,
104
+ back_pressure=back_pressure,
73
105
  )
74
106
  task = asyncio.create_task(worker.loop())
75
107
  task.add_done_callback(_handle_task_result)
76
108
  tasks.append(task)
77
-
78
- return partial(_exit_tasks, tasks)
109
+ if len(tasks):
110
+ finalizers.append(partial(_exit_tasks, tasks))
111
+ return finalizers
79
112
 
80
113
 
81
114
  async def start_ingest_consumers(
@@ -70,7 +70,7 @@ class ParagraphClassifications:
70
70
  denied: dict[str, list[str]]
71
71
 
72
72
 
73
- class ResourceBrainV2:
73
+ class ResourceBrain:
74
74
  def __init__(self, rid: str):
75
75
  self.rid = rid
76
76
  self.brain: PBBrainResource = PBBrainResource(resource=ResourceID(uuid=rid))
@@ -27,14 +27,12 @@ from nidx_protos.noderesources_pb2 import Resource as IndexMessage
27
27
  from nucliadb.common import datamanagers
28
28
  from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
29
29
  from nucliadb.ingest.fields.file import File
30
- from nucliadb.ingest.orm.brain_v2 import ResourceBrainV2 as ResourceBrain
30
+ from nucliadb.ingest.orm.brain_v2 import ResourceBrain
31
31
  from nucliadb.ingest.orm.metrics import index_message_observer as observer
32
32
  from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
33
33
  from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
34
34
  from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
35
35
  from nucliadb_protos.writer_pb2 import BrokerMessage
36
- from nucliadb_utils import const
37
- from nucliadb_utils.utilities import has_feature
38
36
 
39
37
 
40
38
  class IndexMessageBuilder:
@@ -403,14 +401,5 @@ async def get_resource_index_message(
403
401
  """
404
402
  Get the full index message for a resource.
405
403
  """
406
- if has_feature(
407
- const.Features.INDEX_MESSAGE_GENERATION_V2,
408
- context={
409
- "kbid": resource.kb.kbid,
410
- },
411
- ):
412
- im_builder = IndexMessageBuilder(resource)
413
- return await im_builder.full(reindex=reindex)
414
- else:
415
- # TODO: remove this code when we remove the old index message generation
416
- return (await resource.generate_index_message(reindex=reindex)).brain
404
+ im_builder = IndexMessageBuilder(resource)
405
+ return await im_builder.full(reindex=reindex)
@@ -461,8 +461,8 @@ class Processor:
461
461
  source=source,
462
462
  )
463
463
 
464
- @processor_observer.wrap({"type": "generate_index_message_v2"})
465
- async def generate_index_message_v2(
464
+ @processor_observer.wrap({"type": "generate_index_message"})
465
+ async def generate_index_message(
466
466
  self,
467
467
  resource: Resource,
468
468
  messages: list[writer_pb2.BrokerMessage],
@@ -477,40 +477,6 @@ class Processor:
477
477
  else: # pragma: no cover
478
478
  raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
479
479
 
480
- @processor_observer.wrap({"type": "generate_index_message_v1"})
481
- async def generate_index_message_v1(
482
- self,
483
- resource: Resource,
484
- messages: list[writer_pb2.BrokerMessage],
485
- ) -> PBBrainResource:
486
- if any(needs_reindex(m) for m in messages):
487
- # when reindexing, let's just generate full new index message
488
- # TODO - This should be improved in the future as it's not optimal for very large resources:
489
- # As of now, there are some API operations that require fully reindexing all the fields of a resource.
490
- # An example of this is classification label changes - we need to reindex all the fields of a resource to
491
- # propagate the label changes to the index.
492
- resource.replace_indexer(await resource.generate_index_message(reindex=True))
493
- else:
494
- # TODO - Ideally we should only update the fields that have been changed in the current transaction.
495
- await resource.compute_global_text()
496
- await resource.compute_global_tags(resource.indexer)
497
- await resource.compute_security(resource.indexer)
498
- return resource.indexer.brain
499
-
500
- async def generate_index_message(
501
- self,
502
- resource: Resource,
503
- messages: list[writer_pb2.BrokerMessage],
504
- resource_created: bool = False,
505
- ) -> PBBrainResource:
506
- if has_feature(
507
- const.Features.INDEX_MESSAGE_GENERATION_V2,
508
- context={"kbid": resource.kb.kbid},
509
- ):
510
- return await self.generate_index_message_v2(resource, messages, resource_created)
511
- else:
512
- return await self.generate_index_message_v1(resource, messages)
513
-
514
480
  async def external_index_delete_resource(
515
481
  self, external_index_manager: ExternalIndexManager, resource_uuid: str
516
482
  ):
@@ -19,11 +19,9 @@
19
19
  #
20
20
  from __future__ import annotations
21
21
 
22
- import asyncio
23
22
  import logging
24
23
  from collections import defaultdict
25
24
  from concurrent.futures import ThreadPoolExecutor
26
- from functools import partial
27
25
  from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
28
26
 
29
27
  from nucliadb.common import datamanagers
@@ -32,12 +30,11 @@ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
32
30
  from nucliadb.common.maindb.driver import Transaction
33
31
  from nucliadb.ingest.fields.base import Field
34
32
  from nucliadb.ingest.fields.conversation import Conversation
35
- from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
36
33
  from nucliadb.ingest.fields.file import File
37
34
  from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
38
35
  from nucliadb.ingest.fields.link import Link
39
36
  from nucliadb.ingest.fields.text import Text
40
- from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
37
+ from nucliadb.ingest.orm.brain_v2 import FilePagePositions
41
38
  from nucliadb.ingest.orm.metrics import processor_observer
42
39
  from nucliadb_models import content_types
43
40
  from nucliadb_models.common import CloudLink
@@ -69,9 +66,7 @@ from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
69
66
  from nucliadb_protos.resources_pb2 import Origin as PBOrigin
70
67
  from nucliadb_protos.resources_pb2 import Relations as PBRelations
71
68
  from nucliadb_protos.writer_pb2 import BrokerMessage
72
- from nucliadb_utils import const
73
69
  from nucliadb_utils.storages.storage import Storage
74
- from nucliadb_utils.utilities import has_feature
75
70
 
76
71
  if TYPE_CHECKING: # pragma: no cover
77
72
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
@@ -121,7 +116,6 @@ class Resource:
121
116
  self.extra: Optional[PBExtra] = None
122
117
  self.security: Optional[utils_pb2.Security] = None
123
118
  self.modified: bool = False
124
- self._indexer: Optional[ResourceBrain] = None
125
119
  self._modified_extracted_text: list[FieldID] = []
126
120
 
127
121
  self.txn = txn
@@ -133,15 +127,6 @@ class Resource:
133
127
  self._previous_status: Optional[Metadata.Status.ValueType] = None
134
128
  self.user_relations: Optional[PBRelations] = None
135
129
 
136
- @property
137
- def indexer(self) -> ResourceBrain:
138
- if self._indexer is None:
139
- self._indexer = ResourceBrain(rid=self.uuid)
140
- return self._indexer
141
-
142
- def replace_indexer(self, indexer: ResourceBrain) -> None:
143
- self._indexer = indexer
144
-
145
130
  async def set_slug(self):
146
131
  basic = await self.get_basic()
147
132
  new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
@@ -159,14 +144,6 @@ class Resource:
159
144
  if basic_in_payload.HasField("metadata") and basic_in_payload.metadata.useful:
160
145
  current_basic.metadata.status = basic_in_payload.metadata.status
161
146
 
162
- def has_index_message_v2_feature(self) -> bool:
163
- return has_feature(
164
- const.Features.INDEX_MESSAGE_GENERATION_V2,
165
- context={
166
- "kbid": self.kb.kbid,
167
- },
168
- )
169
-
170
147
  @processor_observer.wrap({"type": "set_basic"})
171
148
  async def set_basic(
172
149
  self,
@@ -219,30 +196,6 @@ class Resource:
219
196
  del self.basic.fieldmetadata[:]
220
197
  self.basic.fieldmetadata.extend(updated)
221
198
 
222
- if not self.has_index_message_v2_feature():
223
- # TODO: Remove this when we remove the old indexer is removed
224
- # All modified field metadata should be indexed
225
- # TODO: could be improved to only index the diff
226
- for user_field_metadata in self.basic.fieldmetadata:
227
- field_id = self.generate_field_id(fieldmetadata.field)
228
- field_obj = await self.get_field(
229
- fieldmetadata.field.field, fieldmetadata.field.field_type
230
- )
231
- field_metadata = await field_obj.get_field_metadata()
232
- if field_metadata is not None:
233
- page_positions: Optional[FilePagePositions] = None
234
- if isinstance(field_obj, File):
235
- page_positions = await get_file_page_positions(field_obj)
236
-
237
- self.indexer.apply_field_metadata(
238
- field_id,
239
- field_metadata,
240
- page_positions=page_positions,
241
- extracted_text=await field_obj.get_extracted_text(),
242
- basic_user_field_metadata=user_field_metadata,
243
- replace_field=True,
244
- )
245
-
246
199
  # Some basic fields are computed off field metadata.
247
200
  # This means we need to recompute upon field deletions.
248
201
  if deleted_fields is not None and len(deleted_fields) > 0:
@@ -313,66 +266,6 @@ class Resource:
313
266
  self.modified = True
314
267
  self.user_relations = payload
315
268
 
316
- @processor_observer.wrap({"type": "generate_index_message_old"})
317
- async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
318
- brain = ResourceBrain(rid=self.uuid)
319
- basic = await self.get_basic()
320
- await self.compute_security(brain)
321
- await self.compute_global_tags(brain)
322
- fields = await self.get_fields(force=True)
323
- for (type_id, field_id), field in fields.items():
324
- fieldid = FieldID(field_type=type_id, field=field_id)
325
- await self.compute_global_text_field(fieldid, brain)
326
-
327
- field_metadata = await field.get_field_metadata()
328
- field_key = self.generate_field_id(fieldid)
329
- if field_metadata is not None:
330
- page_positions: Optional[FilePagePositions] = None
331
- if type_id == FieldType.FILE and isinstance(field, File):
332
- page_positions = await get_file_page_positions(field)
333
-
334
- user_field_metadata = None
335
- if basic is not None:
336
- user_field_metadata = next(
337
- (
338
- fm
339
- for fm in basic.fieldmetadata
340
- if fm.field.field == field_id and fm.field.field_type == type_id
341
- ),
342
- None,
343
- )
344
- brain.apply_field_metadata(
345
- field_key,
346
- field_metadata,
347
- page_positions=page_positions,
348
- extracted_text=await field.get_extracted_text(),
349
- basic_user_field_metadata=user_field_metadata,
350
- replace_field=reindex,
351
- )
352
-
353
- if self.disable_vectors is False:
354
- vectorset_configs = []
355
- async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
356
- self.txn, kbid=self.kb.kbid
357
- ):
358
- vectorset_configs.append(vectorset_config)
359
-
360
- for vectorset_config in vectorset_configs:
361
- vo = await field.get_vectors(
362
- vectorset=vectorset_config.vectorset_id,
363
- storage_key_kind=vectorset_config.storage_key_kind,
364
- )
365
- if vo is not None:
366
- dimension = vectorset_config.vectorset_index_config.vector_dimension
367
- brain.apply_field_vectors(
368
- field_key,
369
- vo,
370
- vectorset=vectorset_config.vectorset_id,
371
- vector_dimension=dimension,
372
- replace_field=reindex,
373
- )
374
- return brain
375
-
376
269
  # Fields
377
270
  async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
378
271
  # Get all fields
@@ -445,11 +338,6 @@ class Resource:
445
338
  if field in self.all_fields_keys:
446
339
  self.all_fields_keys.remove(field)
447
340
 
448
- # TODO: Remove this when we remove the old indexer
449
- if not self.has_index_message_v2_feature():
450
- field_key = self.generate_field_id(FieldID(field_type=type, field=key))
451
- self.indexer.delete_field(field_key=field_key)
452
-
453
341
  await field_obj.delete()
454
342
 
455
343
  def has_field(self, type: FieldType.ValueType, field: str) -> bool:
@@ -668,7 +556,6 @@ class Resource:
668
556
  update_basic_languages(self.basic, extracted_languages)
669
557
 
670
558
  # Upload to binary storage
671
- # Vector indexing
672
559
  if self.disable_vectors is False:
673
560
  await self._apply_extracted_vectors(message.field_vectors)
674
561
 
@@ -828,38 +715,7 @@ class Resource:
828
715
  field_metadata.field.field_type,
829
716
  load=False,
830
717
  )
831
- metadata = await field_obj.set_field_metadata(field_metadata)
832
-
833
- # TODO: Remove this when we remove the old indexer
834
- if not self.has_index_message_v2_feature():
835
- field_key = self.generate_field_id(field_metadata.field)
836
-
837
- page_positions: Optional[FilePagePositions] = None
838
- if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
839
- page_positions = await get_file_page_positions(field_obj)
840
-
841
- user_field_metadata = next(
842
- (
843
- fm
844
- for fm in self.basic.fieldmetadata
845
- if fm.field.field == field_metadata.field.field
846
- and fm.field.field_type == field_metadata.field.field_type
847
- ),
848
- None,
849
- )
850
-
851
- extracted_text = await field_obj.get_extracted_text()
852
- apply_field_metadata = partial(
853
- self.indexer.apply_field_metadata,
854
- field_key,
855
- metadata,
856
- page_positions=page_positions,
857
- extracted_text=extracted_text,
858
- basic_user_field_metadata=user_field_metadata,
859
- replace_field=True,
860
- )
861
- loop = asyncio.get_running_loop()
862
- await loop.run_in_executor(_executor, apply_field_metadata)
718
+ await field_obj.set_field_metadata(field_metadata)
863
719
 
864
720
  maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
865
721
 
@@ -913,27 +769,6 @@ class Resource:
913
769
  if vo is None:
914
770
  raise AttributeError("Vector object not found on set_vectors")
915
771
 
916
- if self.has_index_message_v2_feature():
917
- continue
918
-
919
- # TODO: Remove this when we remove the old indexer
920
- # Prepare vectors to be indexed
921
- field_key = self.generate_field_id(field_vectors.field)
922
- dimension = vectorset.vectorset_index_config.vector_dimension
923
- if not dimension:
924
- raise ValueError(f"Vector dimension not set for vectorset '{vectorset.vectorset_id}'")
925
-
926
- apply_field_vectors_partial = partial(
927
- self.indexer.apply_field_vectors,
928
- field_key,
929
- vo,
930
- vectorset=vectorset.vectorset_id,
931
- replace_field=True,
932
- vector_dimension=dimension,
933
- )
934
- loop = asyncio.get_running_loop()
935
- await loop.run_in_executor(_executor, apply_field_vectors_partial)
936
-
937
772
  async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
938
773
  field_obj = await self.get_field(
939
774
  field_large_metadata.field.field,
@@ -946,67 +781,7 @@ class Resource:
946
781
  def generate_field_id(self, field: FieldID) -> str:
947
782
  return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
948
783
 
949
- async def compute_security(self, brain: ResourceBrain):
950
- security = await self.get_security()
951
- if security is None:
952
- return
953
- brain.set_security(security)
954
-
955
- @processor_observer.wrap({"type": "compute_global_tags"})
956
- async def compute_global_tags(self, brain: ResourceBrain):
957
- origin = await self.get_origin()
958
- basic = await self.get_basic()
959
- user_relations = await self.get_user_relations()
960
- if basic is None:
961
- raise KeyError("Resource not found")
962
-
963
- brain.set_processing_status(basic=basic, previous_status=self._previous_status)
964
- brain.set_resource_metadata(basic=basic, origin=origin, user_relations=user_relations)
965
- for type, field in await self.get_fields_ids(force=True):
966
- fieldobj = await self.get_field(field, type, load=False)
967
- fieldid = FieldID(field_type=type, field=field)
968
- fieldkey = self.generate_field_id(fieldid)
969
- extracted_metadata = await fieldobj.get_field_metadata()
970
- valid_user_field_metadata = None
971
- for user_field_metadata in basic.fieldmetadata:
972
- if (
973
- user_field_metadata.field.field == field
974
- and user_field_metadata.field.field_type == type
975
- ):
976
- valid_user_field_metadata = user_field_metadata
977
- break
978
- try:
979
- generated_by = await fieldobj.generated_by()
980
- except FieldAuthorNotFound:
981
- generated_by = None
982
- brain.apply_field_labels(
983
- fieldkey,
984
- extracted_metadata,
985
- self.uuid,
986
- generated_by,
987
- basic.usermetadata,
988
- valid_user_field_metadata,
989
- )
990
-
991
- @processor_observer.wrap({"type": "compute_global_text"})
992
- async def compute_global_text(self):
993
- for type, field in await self.get_fields_ids(force=True):
994
- fieldid = FieldID(field_type=type, field=field)
995
- await self.compute_global_text_field(fieldid, self.indexer)
996
-
997
- async def compute_global_text_field(self, fieldid: FieldID, brain: ResourceBrain):
998
- fieldobj = await self.get_field(fieldid.field, fieldid.field_type, load=False)
999
- fieldkey = self.generate_field_id(fieldid)
1000
- extracted_text = await fieldobj.get_extracted_text()
1001
- if extracted_text is None:
1002
- return
1003
- field_text = extracted_text.text
1004
- for _, split in extracted_text.split_text.items():
1005
- field_text += f" {split} "
1006
- brain.apply_field_text(fieldkey, field_text, replace_field=True)
1007
-
1008
784
  def clean(self):
1009
- self._indexer = None
1010
785
  self.txn = None
1011
786
 
1012
787
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.4.0.post4210
3
+ Version: 6.4.0.post4224
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4210
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4210
25
- Requires-Dist: nucliadb-protos>=6.4.0.post4210
26
- Requires-Dist: nucliadb-models>=6.4.0.post4210
27
- Requires-Dist: nidx-protos>=6.4.0.post4210
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4224
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4224
25
+ Requires-Dist: nucliadb-protos>=6.4.0.post4224
26
+ Requires-Dist: nucliadb-models>=6.4.0.post4224
27
+ Requires-Dist: nidx-protos>=6.4.0.post4224
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn[standard]
@@ -122,7 +122,7 @@ nucliadb/export_import/models.py,sha256=dbjScNkiMRv4X3Ktudy1JRliD25bfoDTy3JmEZgQ
122
122
  nucliadb/export_import/tasks.py,sha256=DWbdqY97ffoyfipelGXz3Jqz1iam6JCjQSh367Fc3NA,2947
123
123
  nucliadb/export_import/utils.py,sha256=8XOVMYXXw8b4ikojG7RjQ4tKN3Xu7nfu2yCUOqD50sk,23216
124
124
  nucliadb/ingest/__init__.py,sha256=fsw3C38VP50km3R-nHL775LNGPpJ4JxqXJ2Ib1f5SqE,1011
125
- nucliadb/ingest/app.py,sha256=KCptzFq1Msq4eHFxvEol4TFwSLdmkG2v1EfQ3C8PhyY,7547
125
+ nucliadb/ingest/app.py,sha256=BKmjpdBEskHcRIHwOnI_jG4gFGs6dV0KKVH9MLJeA48,7546
126
126
  nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE,2410
127
127
  nucliadb/ingest/processing.py,sha256=QmkHq-BU4vub7JRWe9VHvQ2DcAmT6-CzgFXuZxXhcBU,20953
128
128
  nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -134,8 +134,8 @@ nucliadb/ingest/consumer/auditing.py,sha256=xK21DIa_ZAiOJVVbnkmT4jgCRGshNGyPyxsq
134
134
  nucliadb/ingest/consumer/consumer.py,sha256=OgS1fr5Yo55u-XbC6zypTH1aJ562Y1vZHnPDlJJpCXQ,13703
135
135
  nucliadb/ingest/consumer/materializer.py,sha256=tgD_rDI2twQzcz8kKNiW_L4YIth16IGh9mUfD5wiSD4,3858
136
136
  nucliadb/ingest/consumer/metrics.py,sha256=ji1l_4cKiHJthQd8YNem1ft4iMbw9KThmVvJmLcv3Xg,1075
137
- nucliadb/ingest/consumer/pull.py,sha256=EYT0ImngMQgatStG68p2GSrPQBbJxeuq8nFm8DdAbwk,9280
138
- nucliadb/ingest/consumer/service.py,sha256=BLM_dmKZkFBsYl3sj4MZZp5M3kkxHLuO7sE18PqIatw,6538
137
+ nucliadb/ingest/consumer/pull.py,sha256=vv1AyN0EhVgbgnZyT0D_1_IB4hWy7jPd4lAWPAOHGNc,10374
138
+ nucliadb/ingest/consumer/service.py,sha256=GhuqlK-9Lvhzd8kBox8wOlKlJgM3W_gssKoWSfVVdoI,7897
139
139
  nucliadb/ingest/consumer/shard_creator.py,sha256=w0smEu01FU_2cjZnsfBRNqT_Ntho11X17zTMST-vKbc,4359
140
140
  nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
141
141
  nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -147,21 +147,20 @@ nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54
147
147
  nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJs,4172
148
148
  nucliadb/ingest/fields/text.py,sha256=2grxo8twWbpXEd_iwUMBw9q0dWorVmlPONmY5d1ThwQ,1684
149
149
  nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
150
- nucliadb/ingest/orm/brain.py,sha256=q8iW7TWgABcGG5gIrR4SN5Flv_hOy6uuSv3VL8aWgAk,29098
151
- nucliadb/ingest/orm/brain_v2.py,sha256=0OYqH9srWghajGh0l1oqTFPBh1Jtlw3ui3Qpww6IC7A,33573
150
+ nucliadb/ingest/orm/brain_v2.py,sha256=qX81wvU-KCcEZ-hNgkQOskMOlZmdbJqDyAfe7eXbGLw,33571
152
151
  nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjUJNftfoVw,7514
153
152
  nucliadb/ingest/orm/entities.py,sha256=kXyeF6XOpFKhEsGLcY-GLIk21Exp0cJst4XQQ9jJoug,14791
154
153
  nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
155
- nucliadb/ingest/orm/index_message.py,sha256=7Pl2qtqoI3b3NAjWfgoiLQktayngdsJ_NfDH0wpTJBw,16041
154
+ nucliadb/ingest/orm/index_message.py,sha256=hI85nSNVChNLLdEFuEJvOt61Tsir-Gq-2_WZoayAdvk,15617
156
155
  nucliadb/ingest/orm/knowledgebox.py,sha256=_rkeTMIXMhR64gbYtZpFHoUHghV2DTJ2lUBqZsoqC_4,23898
157
156
  nucliadb/ingest/orm/metrics.py,sha256=OiuggTh-n3kZHA2G73NEUdIlh8c3yFrbusI88DK-Mko,1273
158
- nucliadb/ingest/orm/resource.py,sha256=nk-aT9mEPenmZ4blyqTbTOXCpJaSSi1AE-wTJ01V2vA,47007
157
+ nucliadb/ingest/orm/resource.py,sha256=hGELQgnzK2wIWgD478bR5OiVDyAxHn6WrFSq2YuHANU,36896
159
158
  nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,2693
160
159
  nucliadb/ingest/orm/processor/__init__.py,sha256=Aqd9wCNTvggkMkCY3WvoI8spdr94Jnqk-0iq9XpLs18,922
161
160
  nucliadb/ingest/orm/processor/auditing.py,sha256=TeYhXGJRyQ7ROytbb2u8R0fIh_FYi3HgTu3S1ribY3U,4623
162
161
  nucliadb/ingest/orm/processor/data_augmentation.py,sha256=v-pj4GbBWSuO8dQyahs5UDr5ghsyfhCZDS0ftKd6ZYc,5179
163
162
  nucliadb/ingest/orm/processor/pgcatalog.py,sha256=ht9_I5WlPc6sSFTY8PsxHlpjN-EsaBaChwqsLlMXwUk,3100
164
- nucliadb/ingest/orm/processor/processor.py,sha256=jLGLyfj6TIXoLubmNvQvkj5K32MxDDSoktY6M8z5pVk,33149
163
+ nucliadb/ingest/orm/processor/processor.py,sha256=jaEBwbv--WyoC8zcdxWAyF0dAzVA5crVDJl56Bqv1eI,31444
165
164
  nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
166
165
  nucliadb/ingest/service/__init__.py,sha256=LHQFUkdmNBOWqBG0Md9sMMI7g5TQZ-hLAnhw6ZblrJg,2002
167
166
  nucliadb/ingest/service/exceptions.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -369,8 +368,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
369
368
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
370
369
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
371
370
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
372
- nucliadb-6.4.0.post4210.dist-info/METADATA,sha256=SB9gIMgWxoWNtUEexRLH85E0PL-MnroGhJ6aOambTT4,4223
373
- nucliadb-6.4.0.post4210.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
374
- nucliadb-6.4.0.post4210.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
375
- nucliadb-6.4.0.post4210.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
376
- nucliadb-6.4.0.post4210.dist-info/RECORD,,
371
+ nucliadb-6.4.0.post4224.dist-info/METADATA,sha256=G9L1810f7GDMjI54RDmZj-ZcpBD3_duqsGRR2q3c6yY,4223
372
+ nucliadb-6.4.0.post4224.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
373
+ nucliadb-6.4.0.post4224.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
374
+ nucliadb-6.4.0.post4224.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
375
+ nucliadb-6.4.0.post4224.dist-info/RECORD,,