nucliadb 6.4.0.post4210__py3-none-any.whl → 6.4.0.post4224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/ingest/app.py +2 -2
- nucliadb/ingest/consumer/pull.py +23 -0
- nucliadb/ingest/consumer/service.py +37 -4
- nucliadb/ingest/orm/brain_v2.py +1 -1
- nucliadb/ingest/orm/index_message.py +3 -14
- nucliadb/ingest/orm/processor/processor.py +2 -36
- nucliadb/ingest/orm/resource.py +2 -227
- {nucliadb-6.4.0.post4210.dist-info → nucliadb-6.4.0.post4224.dist-info}/METADATA +6 -6
- {nucliadb-6.4.0.post4210.dist-info → nucliadb-6.4.0.post4224.dist-info}/RECORD +12 -13
- nucliadb/ingest/orm/brain.py +0 -695
- {nucliadb-6.4.0.post4210.dist-info → nucliadb-6.4.0.post4224.dist-info}/WHEEL +0 -0
- {nucliadb-6.4.0.post4210.dist-info → nucliadb-6.4.0.post4224.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.4.0.post4210.dist-info → nucliadb-6.4.0.post4224.dist-info}/top_level.txt +0 -0
nucliadb/ingest/app.py
CHANGED
@@ -103,7 +103,7 @@ async def initialize_pull_workers() -> list[Callable[[], Awaitable[None]]]:
|
|
103
103
|
finalizers = await initialize_grpc()
|
104
104
|
pull_workers = await consumer_service.start_pull_workers(SERVICE_NAME)
|
105
105
|
|
106
|
-
return
|
106
|
+
return pull_workers + finalizers
|
107
107
|
|
108
108
|
|
109
109
|
async def main_consumer(): # pragma: no cover
|
@@ -117,7 +117,7 @@ async def main_consumer(): # pragma: no cover
|
|
117
117
|
ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
|
118
118
|
|
119
119
|
await run_until_exit(
|
120
|
-
[grpc_health_finalizer,
|
120
|
+
[grpc_health_finalizer, ingest_consumers, metrics_server.shutdown] + pull_workers + finalizers
|
121
121
|
)
|
122
122
|
|
123
123
|
|
nucliadb/ingest/consumer/pull.py
CHANGED
@@ -19,11 +19,14 @@
|
|
19
19
|
#
|
20
20
|
import asyncio
|
21
21
|
import base64
|
22
|
+
from datetime import datetime, timezone
|
22
23
|
from typing import Optional
|
23
24
|
|
24
25
|
from aiohttp.client_exceptions import ClientConnectorError
|
25
26
|
|
26
27
|
from nucliadb.common import datamanagers
|
28
|
+
from nucliadb.common.back_pressure.materializer import BackPressureMaterializer
|
29
|
+
from nucliadb.common.back_pressure.utils import BackPressureException
|
27
30
|
from nucliadb.common.http_clients.processing import ProcessingHTTPClient, get_nua_api_id
|
28
31
|
from nucliadb.common.maindb.driver import Driver
|
29
32
|
from nucliadb.ingest import logger, logger_activity
|
@@ -57,6 +60,7 @@ class PullWorker:
|
|
57
60
|
local_subscriber: bool = False,
|
58
61
|
pull_time_empty_backoff: float = 5.0,
|
59
62
|
pull_api_timeout: int = 60,
|
63
|
+
back_pressure: Optional[BackPressureMaterializer] = None,
|
60
64
|
):
|
61
65
|
self.partition = partition
|
62
66
|
self.pull_time_error_backoff = pull_time_error_backoff
|
@@ -65,6 +69,7 @@ class PullWorker:
|
|
65
69
|
self.local_subscriber = local_subscriber
|
66
70
|
|
67
71
|
self.processor = Processor(driver, storage, pubsub, partition)
|
72
|
+
self.back_pressure = back_pressure
|
68
73
|
|
69
74
|
def __str__(self) -> str:
|
70
75
|
return f"PullWorker(partition={self.partition})"
|
@@ -112,11 +117,29 @@ class PullWorker:
|
|
112
117
|
transaction_check=False,
|
113
118
|
)
|
114
119
|
|
120
|
+
async def back_pressure_check(self) -> None:
|
121
|
+
if self.back_pressure is None:
|
122
|
+
return
|
123
|
+
while True:
|
124
|
+
try:
|
125
|
+
self.back_pressure.check_indexing()
|
126
|
+
self.back_pressure.check_ingest()
|
127
|
+
break
|
128
|
+
except BackPressureException as exc:
|
129
|
+
sleep_time = (datetime.now(timezone.utc) - exc.data.try_after).total_seconds()
|
130
|
+
logger.warning(f"Back pressure active! Sleeping for {sleep_time} seconds", exc_info=True)
|
131
|
+
await asyncio.sleep(sleep_time)
|
132
|
+
except Exception as e:
|
133
|
+
errors.capture_exception(e)
|
134
|
+
logger.exception("Error while checking back pressure. Moving on")
|
135
|
+
break
|
136
|
+
|
115
137
|
async def loop(self):
|
116
138
|
"""
|
117
139
|
Run this forever
|
118
140
|
"""
|
119
141
|
while True:
|
142
|
+
await self.back_pressure_check()
|
120
143
|
try:
|
121
144
|
await self._loop()
|
122
145
|
except ReallyStopPulling:
|
@@ -22,18 +22,22 @@ import sys
|
|
22
22
|
from functools import partial
|
23
23
|
from typing import Awaitable, Callable, Optional
|
24
24
|
|
25
|
+
from nucliadb.common.back_pressure.materializer import BackPressureMaterializer
|
26
|
+
from nucliadb.common.back_pressure.settings import settings as back_pressure_settings
|
27
|
+
from nucliadb.common.back_pressure.utils import is_back_pressure_enabled
|
25
28
|
from nucliadb.common.maindb.utils import setup_driver
|
26
29
|
from nucliadb.ingest import SERVICE_NAME, logger
|
27
30
|
from nucliadb.ingest.consumer.consumer import IngestConsumer, IngestProcessedConsumer
|
28
31
|
from nucliadb.ingest.consumer.pull import PullWorker
|
29
32
|
from nucliadb.ingest.settings import settings
|
30
33
|
from nucliadb_utils.exceptions import ConfigurationError
|
31
|
-
from nucliadb_utils.settings import transaction_settings
|
34
|
+
from nucliadb_utils.settings import indexing_settings, transaction_settings
|
32
35
|
from nucliadb_utils.utilities import (
|
33
36
|
get_audit,
|
34
37
|
get_nats_manager,
|
35
38
|
get_pubsub,
|
36
39
|
get_storage,
|
40
|
+
start_nats_manager,
|
37
41
|
)
|
38
42
|
|
39
43
|
from .auditing import IndexAuditHandler, ResourceWritesAuditHandler
|
@@ -54,12 +58,39 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
|
|
54
58
|
await asyncio.gather(*tasks, return_exceptions=True)
|
55
59
|
|
56
60
|
|
61
|
+
async def start_back_pressure() -> BackPressureMaterializer:
|
62
|
+
logger.info("Starting back pressure materializer")
|
63
|
+
nats_manager = await start_nats_manager(
|
64
|
+
SERVICE_NAME,
|
65
|
+
indexing_settings.index_jetstream_servers,
|
66
|
+
indexing_settings.index_jetstream_auth,
|
67
|
+
)
|
68
|
+
back_pressure = BackPressureMaterializer(
|
69
|
+
nats_manager,
|
70
|
+
indexing_check_interval=back_pressure_settings.indexing_check_interval,
|
71
|
+
ingest_check_interval=back_pressure_settings.ingest_check_interval,
|
72
|
+
)
|
73
|
+
await back_pressure.start()
|
74
|
+
return back_pressure
|
75
|
+
|
76
|
+
|
77
|
+
async def stop_back_pressure(materializer: BackPressureMaterializer) -> None:
|
78
|
+
await materializer.stop()
|
79
|
+
await materializer.nats_manager.finalize()
|
80
|
+
|
81
|
+
|
57
82
|
async def start_pull_workers(
|
58
83
|
service_name: Optional[str] = None,
|
59
|
-
) -> Callable[[], Awaitable[None]]:
|
84
|
+
) -> list[Callable[[], Awaitable[None]]]:
|
85
|
+
finalizers: list[Callable[[], Awaitable[None]]] = []
|
86
|
+
|
60
87
|
driver = await setup_driver()
|
61
88
|
pubsub = await get_pubsub()
|
62
89
|
storage = await get_storage(service_name=service_name or SERVICE_NAME)
|
90
|
+
back_pressure = None
|
91
|
+
if is_back_pressure_enabled():
|
92
|
+
back_pressure = await start_back_pressure()
|
93
|
+
finalizers.append(partial(stop_back_pressure, back_pressure))
|
63
94
|
tasks = []
|
64
95
|
for partition in settings.partitions:
|
65
96
|
worker = PullWorker(
|
@@ -70,12 +101,14 @@ async def start_pull_workers(
|
|
70
101
|
pubsub=pubsub,
|
71
102
|
local_subscriber=transaction_settings.transaction_local,
|
72
103
|
pull_api_timeout=settings.pull_api_timeout,
|
104
|
+
back_pressure=back_pressure,
|
73
105
|
)
|
74
106
|
task = asyncio.create_task(worker.loop())
|
75
107
|
task.add_done_callback(_handle_task_result)
|
76
108
|
tasks.append(task)
|
77
|
-
|
78
|
-
|
109
|
+
if len(tasks):
|
110
|
+
finalizers.append(partial(_exit_tasks, tasks))
|
111
|
+
return finalizers
|
79
112
|
|
80
113
|
|
81
114
|
async def start_ingest_consumers(
|
nucliadb/ingest/orm/brain_v2.py
CHANGED
@@ -27,14 +27,12 @@ from nidx_protos.noderesources_pb2 import Resource as IndexMessage
|
|
27
27
|
from nucliadb.common import datamanagers
|
28
28
|
from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
|
29
29
|
from nucliadb.ingest.fields.file import File
|
30
|
-
from nucliadb.ingest.orm.brain_v2 import
|
30
|
+
from nucliadb.ingest.orm.brain_v2 import ResourceBrain
|
31
31
|
from nucliadb.ingest.orm.metrics import index_message_observer as observer
|
32
32
|
from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
|
33
33
|
from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
|
34
34
|
from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
|
35
35
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
36
|
-
from nucliadb_utils import const
|
37
|
-
from nucliadb_utils.utilities import has_feature
|
38
36
|
|
39
37
|
|
40
38
|
class IndexMessageBuilder:
|
@@ -403,14 +401,5 @@ async def get_resource_index_message(
|
|
403
401
|
"""
|
404
402
|
Get the full index message for a resource.
|
405
403
|
"""
|
406
|
-
|
407
|
-
|
408
|
-
context={
|
409
|
-
"kbid": resource.kb.kbid,
|
410
|
-
},
|
411
|
-
):
|
412
|
-
im_builder = IndexMessageBuilder(resource)
|
413
|
-
return await im_builder.full(reindex=reindex)
|
414
|
-
else:
|
415
|
-
# TODO: remove this code when we remove the old index message generation
|
416
|
-
return (await resource.generate_index_message(reindex=reindex)).brain
|
404
|
+
im_builder = IndexMessageBuilder(resource)
|
405
|
+
return await im_builder.full(reindex=reindex)
|
@@ -461,8 +461,8 @@ class Processor:
|
|
461
461
|
source=source,
|
462
462
|
)
|
463
463
|
|
464
|
-
@processor_observer.wrap({"type": "
|
465
|
-
async def
|
464
|
+
@processor_observer.wrap({"type": "generate_index_message"})
|
465
|
+
async def generate_index_message(
|
466
466
|
self,
|
467
467
|
resource: Resource,
|
468
468
|
messages: list[writer_pb2.BrokerMessage],
|
@@ -477,40 +477,6 @@ class Processor:
|
|
477
477
|
else: # pragma: no cover
|
478
478
|
raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
|
479
479
|
|
480
|
-
@processor_observer.wrap({"type": "generate_index_message_v1"})
|
481
|
-
async def generate_index_message_v1(
|
482
|
-
self,
|
483
|
-
resource: Resource,
|
484
|
-
messages: list[writer_pb2.BrokerMessage],
|
485
|
-
) -> PBBrainResource:
|
486
|
-
if any(needs_reindex(m) for m in messages):
|
487
|
-
# when reindexing, let's just generate full new index message
|
488
|
-
# TODO - This should be improved in the future as it's not optimal for very large resources:
|
489
|
-
# As of now, there are some API operations that require fully reindexing all the fields of a resource.
|
490
|
-
# An example of this is classification label changes - we need to reindex all the fields of a resource to
|
491
|
-
# propagate the label changes to the index.
|
492
|
-
resource.replace_indexer(await resource.generate_index_message(reindex=True))
|
493
|
-
else:
|
494
|
-
# TODO - Ideally we should only update the fields that have been changed in the current transaction.
|
495
|
-
await resource.compute_global_text()
|
496
|
-
await resource.compute_global_tags(resource.indexer)
|
497
|
-
await resource.compute_security(resource.indexer)
|
498
|
-
return resource.indexer.brain
|
499
|
-
|
500
|
-
async def generate_index_message(
|
501
|
-
self,
|
502
|
-
resource: Resource,
|
503
|
-
messages: list[writer_pb2.BrokerMessage],
|
504
|
-
resource_created: bool = False,
|
505
|
-
) -> PBBrainResource:
|
506
|
-
if has_feature(
|
507
|
-
const.Features.INDEX_MESSAGE_GENERATION_V2,
|
508
|
-
context={"kbid": resource.kb.kbid},
|
509
|
-
):
|
510
|
-
return await self.generate_index_message_v2(resource, messages, resource_created)
|
511
|
-
else:
|
512
|
-
return await self.generate_index_message_v1(resource, messages)
|
513
|
-
|
514
480
|
async def external_index_delete_resource(
|
515
481
|
self, external_index_manager: ExternalIndexManager, resource_uuid: str
|
516
482
|
):
|
nucliadb/ingest/orm/resource.py
CHANGED
@@ -19,11 +19,9 @@
|
|
19
19
|
#
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
|
-
import asyncio
|
23
22
|
import logging
|
24
23
|
from collections import defaultdict
|
25
24
|
from concurrent.futures import ThreadPoolExecutor
|
26
|
-
from functools import partial
|
27
25
|
from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
|
28
26
|
|
29
27
|
from nucliadb.common import datamanagers
|
@@ -32,12 +30,11 @@ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
|
|
32
30
|
from nucliadb.common.maindb.driver import Transaction
|
33
31
|
from nucliadb.ingest.fields.base import Field
|
34
32
|
from nucliadb.ingest.fields.conversation import Conversation
|
35
|
-
from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
|
36
33
|
from nucliadb.ingest.fields.file import File
|
37
34
|
from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
|
38
35
|
from nucliadb.ingest.fields.link import Link
|
39
36
|
from nucliadb.ingest.fields.text import Text
|
40
|
-
from nucliadb.ingest.orm.
|
37
|
+
from nucliadb.ingest.orm.brain_v2 import FilePagePositions
|
41
38
|
from nucliadb.ingest.orm.metrics import processor_observer
|
42
39
|
from nucliadb_models import content_types
|
43
40
|
from nucliadb_models.common import CloudLink
|
@@ -69,9 +66,7 @@ from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
|
|
69
66
|
from nucliadb_protos.resources_pb2 import Origin as PBOrigin
|
70
67
|
from nucliadb_protos.resources_pb2 import Relations as PBRelations
|
71
68
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
72
|
-
from nucliadb_utils import const
|
73
69
|
from nucliadb_utils.storages.storage import Storage
|
74
|
-
from nucliadb_utils.utilities import has_feature
|
75
70
|
|
76
71
|
if TYPE_CHECKING: # pragma: no cover
|
77
72
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
@@ -121,7 +116,6 @@ class Resource:
|
|
121
116
|
self.extra: Optional[PBExtra] = None
|
122
117
|
self.security: Optional[utils_pb2.Security] = None
|
123
118
|
self.modified: bool = False
|
124
|
-
self._indexer: Optional[ResourceBrain] = None
|
125
119
|
self._modified_extracted_text: list[FieldID] = []
|
126
120
|
|
127
121
|
self.txn = txn
|
@@ -133,15 +127,6 @@ class Resource:
|
|
133
127
|
self._previous_status: Optional[Metadata.Status.ValueType] = None
|
134
128
|
self.user_relations: Optional[PBRelations] = None
|
135
129
|
|
136
|
-
@property
|
137
|
-
def indexer(self) -> ResourceBrain:
|
138
|
-
if self._indexer is None:
|
139
|
-
self._indexer = ResourceBrain(rid=self.uuid)
|
140
|
-
return self._indexer
|
141
|
-
|
142
|
-
def replace_indexer(self, indexer: ResourceBrain) -> None:
|
143
|
-
self._indexer = indexer
|
144
|
-
|
145
130
|
async def set_slug(self):
|
146
131
|
basic = await self.get_basic()
|
147
132
|
new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
|
@@ -159,14 +144,6 @@ class Resource:
|
|
159
144
|
if basic_in_payload.HasField("metadata") and basic_in_payload.metadata.useful:
|
160
145
|
current_basic.metadata.status = basic_in_payload.metadata.status
|
161
146
|
|
162
|
-
def has_index_message_v2_feature(self) -> bool:
|
163
|
-
return has_feature(
|
164
|
-
const.Features.INDEX_MESSAGE_GENERATION_V2,
|
165
|
-
context={
|
166
|
-
"kbid": self.kb.kbid,
|
167
|
-
},
|
168
|
-
)
|
169
|
-
|
170
147
|
@processor_observer.wrap({"type": "set_basic"})
|
171
148
|
async def set_basic(
|
172
149
|
self,
|
@@ -219,30 +196,6 @@ class Resource:
|
|
219
196
|
del self.basic.fieldmetadata[:]
|
220
197
|
self.basic.fieldmetadata.extend(updated)
|
221
198
|
|
222
|
-
if not self.has_index_message_v2_feature():
|
223
|
-
# TODO: Remove this when we remove the old indexer is removed
|
224
|
-
# All modified field metadata should be indexed
|
225
|
-
# TODO: could be improved to only index the diff
|
226
|
-
for user_field_metadata in self.basic.fieldmetadata:
|
227
|
-
field_id = self.generate_field_id(fieldmetadata.field)
|
228
|
-
field_obj = await self.get_field(
|
229
|
-
fieldmetadata.field.field, fieldmetadata.field.field_type
|
230
|
-
)
|
231
|
-
field_metadata = await field_obj.get_field_metadata()
|
232
|
-
if field_metadata is not None:
|
233
|
-
page_positions: Optional[FilePagePositions] = None
|
234
|
-
if isinstance(field_obj, File):
|
235
|
-
page_positions = await get_file_page_positions(field_obj)
|
236
|
-
|
237
|
-
self.indexer.apply_field_metadata(
|
238
|
-
field_id,
|
239
|
-
field_metadata,
|
240
|
-
page_positions=page_positions,
|
241
|
-
extracted_text=await field_obj.get_extracted_text(),
|
242
|
-
basic_user_field_metadata=user_field_metadata,
|
243
|
-
replace_field=True,
|
244
|
-
)
|
245
|
-
|
246
199
|
# Some basic fields are computed off field metadata.
|
247
200
|
# This means we need to recompute upon field deletions.
|
248
201
|
if deleted_fields is not None and len(deleted_fields) > 0:
|
@@ -313,66 +266,6 @@ class Resource:
|
|
313
266
|
self.modified = True
|
314
267
|
self.user_relations = payload
|
315
268
|
|
316
|
-
@processor_observer.wrap({"type": "generate_index_message_old"})
|
317
|
-
async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
|
318
|
-
brain = ResourceBrain(rid=self.uuid)
|
319
|
-
basic = await self.get_basic()
|
320
|
-
await self.compute_security(brain)
|
321
|
-
await self.compute_global_tags(brain)
|
322
|
-
fields = await self.get_fields(force=True)
|
323
|
-
for (type_id, field_id), field in fields.items():
|
324
|
-
fieldid = FieldID(field_type=type_id, field=field_id)
|
325
|
-
await self.compute_global_text_field(fieldid, brain)
|
326
|
-
|
327
|
-
field_metadata = await field.get_field_metadata()
|
328
|
-
field_key = self.generate_field_id(fieldid)
|
329
|
-
if field_metadata is not None:
|
330
|
-
page_positions: Optional[FilePagePositions] = None
|
331
|
-
if type_id == FieldType.FILE and isinstance(field, File):
|
332
|
-
page_positions = await get_file_page_positions(field)
|
333
|
-
|
334
|
-
user_field_metadata = None
|
335
|
-
if basic is not None:
|
336
|
-
user_field_metadata = next(
|
337
|
-
(
|
338
|
-
fm
|
339
|
-
for fm in basic.fieldmetadata
|
340
|
-
if fm.field.field == field_id and fm.field.field_type == type_id
|
341
|
-
),
|
342
|
-
None,
|
343
|
-
)
|
344
|
-
brain.apply_field_metadata(
|
345
|
-
field_key,
|
346
|
-
field_metadata,
|
347
|
-
page_positions=page_positions,
|
348
|
-
extracted_text=await field.get_extracted_text(),
|
349
|
-
basic_user_field_metadata=user_field_metadata,
|
350
|
-
replace_field=reindex,
|
351
|
-
)
|
352
|
-
|
353
|
-
if self.disable_vectors is False:
|
354
|
-
vectorset_configs = []
|
355
|
-
async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
|
356
|
-
self.txn, kbid=self.kb.kbid
|
357
|
-
):
|
358
|
-
vectorset_configs.append(vectorset_config)
|
359
|
-
|
360
|
-
for vectorset_config in vectorset_configs:
|
361
|
-
vo = await field.get_vectors(
|
362
|
-
vectorset=vectorset_config.vectorset_id,
|
363
|
-
storage_key_kind=vectorset_config.storage_key_kind,
|
364
|
-
)
|
365
|
-
if vo is not None:
|
366
|
-
dimension = vectorset_config.vectorset_index_config.vector_dimension
|
367
|
-
brain.apply_field_vectors(
|
368
|
-
field_key,
|
369
|
-
vo,
|
370
|
-
vectorset=vectorset_config.vectorset_id,
|
371
|
-
vector_dimension=dimension,
|
372
|
-
replace_field=reindex,
|
373
|
-
)
|
374
|
-
return brain
|
375
|
-
|
376
269
|
# Fields
|
377
270
|
async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
|
378
271
|
# Get all fields
|
@@ -445,11 +338,6 @@ class Resource:
|
|
445
338
|
if field in self.all_fields_keys:
|
446
339
|
self.all_fields_keys.remove(field)
|
447
340
|
|
448
|
-
# TODO: Remove this when we remove the old indexer
|
449
|
-
if not self.has_index_message_v2_feature():
|
450
|
-
field_key = self.generate_field_id(FieldID(field_type=type, field=key))
|
451
|
-
self.indexer.delete_field(field_key=field_key)
|
452
|
-
|
453
341
|
await field_obj.delete()
|
454
342
|
|
455
343
|
def has_field(self, type: FieldType.ValueType, field: str) -> bool:
|
@@ -668,7 +556,6 @@ class Resource:
|
|
668
556
|
update_basic_languages(self.basic, extracted_languages)
|
669
557
|
|
670
558
|
# Upload to binary storage
|
671
|
-
# Vector indexing
|
672
559
|
if self.disable_vectors is False:
|
673
560
|
await self._apply_extracted_vectors(message.field_vectors)
|
674
561
|
|
@@ -828,38 +715,7 @@ class Resource:
|
|
828
715
|
field_metadata.field.field_type,
|
829
716
|
load=False,
|
830
717
|
)
|
831
|
-
|
832
|
-
|
833
|
-
# TODO: Remove this when we remove the old indexer
|
834
|
-
if not self.has_index_message_v2_feature():
|
835
|
-
field_key = self.generate_field_id(field_metadata.field)
|
836
|
-
|
837
|
-
page_positions: Optional[FilePagePositions] = None
|
838
|
-
if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
|
839
|
-
page_positions = await get_file_page_positions(field_obj)
|
840
|
-
|
841
|
-
user_field_metadata = next(
|
842
|
-
(
|
843
|
-
fm
|
844
|
-
for fm in self.basic.fieldmetadata
|
845
|
-
if fm.field.field == field_metadata.field.field
|
846
|
-
and fm.field.field_type == field_metadata.field.field_type
|
847
|
-
),
|
848
|
-
None,
|
849
|
-
)
|
850
|
-
|
851
|
-
extracted_text = await field_obj.get_extracted_text()
|
852
|
-
apply_field_metadata = partial(
|
853
|
-
self.indexer.apply_field_metadata,
|
854
|
-
field_key,
|
855
|
-
metadata,
|
856
|
-
page_positions=page_positions,
|
857
|
-
extracted_text=extracted_text,
|
858
|
-
basic_user_field_metadata=user_field_metadata,
|
859
|
-
replace_field=True,
|
860
|
-
)
|
861
|
-
loop = asyncio.get_running_loop()
|
862
|
-
await loop.run_in_executor(_executor, apply_field_metadata)
|
718
|
+
await field_obj.set_field_metadata(field_metadata)
|
863
719
|
|
864
720
|
maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
|
865
721
|
|
@@ -913,27 +769,6 @@ class Resource:
|
|
913
769
|
if vo is None:
|
914
770
|
raise AttributeError("Vector object not found on set_vectors")
|
915
771
|
|
916
|
-
if self.has_index_message_v2_feature():
|
917
|
-
continue
|
918
|
-
|
919
|
-
# TODO: Remove this when we remove the old indexer
|
920
|
-
# Prepare vectors to be indexed
|
921
|
-
field_key = self.generate_field_id(field_vectors.field)
|
922
|
-
dimension = vectorset.vectorset_index_config.vector_dimension
|
923
|
-
if not dimension:
|
924
|
-
raise ValueError(f"Vector dimension not set for vectorset '{vectorset.vectorset_id}'")
|
925
|
-
|
926
|
-
apply_field_vectors_partial = partial(
|
927
|
-
self.indexer.apply_field_vectors,
|
928
|
-
field_key,
|
929
|
-
vo,
|
930
|
-
vectorset=vectorset.vectorset_id,
|
931
|
-
replace_field=True,
|
932
|
-
vector_dimension=dimension,
|
933
|
-
)
|
934
|
-
loop = asyncio.get_running_loop()
|
935
|
-
await loop.run_in_executor(_executor, apply_field_vectors_partial)
|
936
|
-
|
937
772
|
async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
|
938
773
|
field_obj = await self.get_field(
|
939
774
|
field_large_metadata.field.field,
|
@@ -946,67 +781,7 @@ class Resource:
|
|
946
781
|
def generate_field_id(self, field: FieldID) -> str:
|
947
782
|
return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
|
948
783
|
|
949
|
-
async def compute_security(self, brain: ResourceBrain):
|
950
|
-
security = await self.get_security()
|
951
|
-
if security is None:
|
952
|
-
return
|
953
|
-
brain.set_security(security)
|
954
|
-
|
955
|
-
@processor_observer.wrap({"type": "compute_global_tags"})
|
956
|
-
async def compute_global_tags(self, brain: ResourceBrain):
|
957
|
-
origin = await self.get_origin()
|
958
|
-
basic = await self.get_basic()
|
959
|
-
user_relations = await self.get_user_relations()
|
960
|
-
if basic is None:
|
961
|
-
raise KeyError("Resource not found")
|
962
|
-
|
963
|
-
brain.set_processing_status(basic=basic, previous_status=self._previous_status)
|
964
|
-
brain.set_resource_metadata(basic=basic, origin=origin, user_relations=user_relations)
|
965
|
-
for type, field in await self.get_fields_ids(force=True):
|
966
|
-
fieldobj = await self.get_field(field, type, load=False)
|
967
|
-
fieldid = FieldID(field_type=type, field=field)
|
968
|
-
fieldkey = self.generate_field_id(fieldid)
|
969
|
-
extracted_metadata = await fieldobj.get_field_metadata()
|
970
|
-
valid_user_field_metadata = None
|
971
|
-
for user_field_metadata in basic.fieldmetadata:
|
972
|
-
if (
|
973
|
-
user_field_metadata.field.field == field
|
974
|
-
and user_field_metadata.field.field_type == type
|
975
|
-
):
|
976
|
-
valid_user_field_metadata = user_field_metadata
|
977
|
-
break
|
978
|
-
try:
|
979
|
-
generated_by = await fieldobj.generated_by()
|
980
|
-
except FieldAuthorNotFound:
|
981
|
-
generated_by = None
|
982
|
-
brain.apply_field_labels(
|
983
|
-
fieldkey,
|
984
|
-
extracted_metadata,
|
985
|
-
self.uuid,
|
986
|
-
generated_by,
|
987
|
-
basic.usermetadata,
|
988
|
-
valid_user_field_metadata,
|
989
|
-
)
|
990
|
-
|
991
|
-
@processor_observer.wrap({"type": "compute_global_text"})
|
992
|
-
async def compute_global_text(self):
|
993
|
-
for type, field in await self.get_fields_ids(force=True):
|
994
|
-
fieldid = FieldID(field_type=type, field=field)
|
995
|
-
await self.compute_global_text_field(fieldid, self.indexer)
|
996
|
-
|
997
|
-
async def compute_global_text_field(self, fieldid: FieldID, brain: ResourceBrain):
|
998
|
-
fieldobj = await self.get_field(fieldid.field, fieldid.field_type, load=False)
|
999
|
-
fieldkey = self.generate_field_id(fieldid)
|
1000
|
-
extracted_text = await fieldobj.get_extracted_text()
|
1001
|
-
if extracted_text is None:
|
1002
|
-
return
|
1003
|
-
field_text = extracted_text.text
|
1004
|
-
for _, split in extracted_text.split_text.items():
|
1005
|
-
field_text += f" {split} "
|
1006
|
-
brain.apply_field_text(fieldkey, field_text, replace_field=True)
|
1007
|
-
|
1008
784
|
def clean(self):
|
1009
|
-
self._indexer = None
|
1010
785
|
self.txn = None
|
1011
786
|
|
1012
787
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.4.0.
|
3
|
+
Version: 6.4.0.post4224
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License: AGPL
|
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
21
21
|
Requires-Python: <4,>=3.9
|
22
22
|
Description-Content-Type: text/markdown
|
23
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.4.0.
|
24
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.
|
25
|
-
Requires-Dist: nucliadb-protos>=6.4.0.
|
26
|
-
Requires-Dist: nucliadb-models>=6.4.0.
|
27
|
-
Requires-Dist: nidx-protos>=6.4.0.
|
23
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4224
|
24
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4224
|
25
|
+
Requires-Dist: nucliadb-protos>=6.4.0.post4224
|
26
|
+
Requires-Dist: nucliadb-models>=6.4.0.post4224
|
27
|
+
Requires-Dist: nidx-protos>=6.4.0.post4224
|
28
28
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
29
29
|
Requires-Dist: nuclia-models>=0.24.2
|
30
30
|
Requires-Dist: uvicorn[standard]
|
@@ -122,7 +122,7 @@ nucliadb/export_import/models.py,sha256=dbjScNkiMRv4X3Ktudy1JRliD25bfoDTy3JmEZgQ
|
|
122
122
|
nucliadb/export_import/tasks.py,sha256=DWbdqY97ffoyfipelGXz3Jqz1iam6JCjQSh367Fc3NA,2947
|
123
123
|
nucliadb/export_import/utils.py,sha256=8XOVMYXXw8b4ikojG7RjQ4tKN3Xu7nfu2yCUOqD50sk,23216
|
124
124
|
nucliadb/ingest/__init__.py,sha256=fsw3C38VP50km3R-nHL775LNGPpJ4JxqXJ2Ib1f5SqE,1011
|
125
|
-
nucliadb/ingest/app.py,sha256=
|
125
|
+
nucliadb/ingest/app.py,sha256=BKmjpdBEskHcRIHwOnI_jG4gFGs6dV0KKVH9MLJeA48,7546
|
126
126
|
nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE,2410
|
127
127
|
nucliadb/ingest/processing.py,sha256=QmkHq-BU4vub7JRWe9VHvQ2DcAmT6-CzgFXuZxXhcBU,20953
|
128
128
|
nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -134,8 +134,8 @@ nucliadb/ingest/consumer/auditing.py,sha256=xK21DIa_ZAiOJVVbnkmT4jgCRGshNGyPyxsq
|
|
134
134
|
nucliadb/ingest/consumer/consumer.py,sha256=OgS1fr5Yo55u-XbC6zypTH1aJ562Y1vZHnPDlJJpCXQ,13703
|
135
135
|
nucliadb/ingest/consumer/materializer.py,sha256=tgD_rDI2twQzcz8kKNiW_L4YIth16IGh9mUfD5wiSD4,3858
|
136
136
|
nucliadb/ingest/consumer/metrics.py,sha256=ji1l_4cKiHJthQd8YNem1ft4iMbw9KThmVvJmLcv3Xg,1075
|
137
|
-
nucliadb/ingest/consumer/pull.py,sha256=
|
138
|
-
nucliadb/ingest/consumer/service.py,sha256=
|
137
|
+
nucliadb/ingest/consumer/pull.py,sha256=vv1AyN0EhVgbgnZyT0D_1_IB4hWy7jPd4lAWPAOHGNc,10374
|
138
|
+
nucliadb/ingest/consumer/service.py,sha256=GhuqlK-9Lvhzd8kBox8wOlKlJgM3W_gssKoWSfVVdoI,7897
|
139
139
|
nucliadb/ingest/consumer/shard_creator.py,sha256=w0smEu01FU_2cjZnsfBRNqT_Ntho11X17zTMST-vKbc,4359
|
140
140
|
nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
|
141
141
|
nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
@@ -147,21 +147,20 @@ nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54
|
|
147
147
|
nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJs,4172
|
148
148
|
nucliadb/ingest/fields/text.py,sha256=2grxo8twWbpXEd_iwUMBw9q0dWorVmlPONmY5d1ThwQ,1684
|
149
149
|
nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
150
|
-
nucliadb/ingest/orm/
|
151
|
-
nucliadb/ingest/orm/brain_v2.py,sha256=0OYqH9srWghajGh0l1oqTFPBh1Jtlw3ui3Qpww6IC7A,33573
|
150
|
+
nucliadb/ingest/orm/brain_v2.py,sha256=qX81wvU-KCcEZ-hNgkQOskMOlZmdbJqDyAfe7eXbGLw,33571
|
152
151
|
nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjUJNftfoVw,7514
|
153
152
|
nucliadb/ingest/orm/entities.py,sha256=kXyeF6XOpFKhEsGLcY-GLIk21Exp0cJst4XQQ9jJoug,14791
|
154
153
|
nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
|
155
|
-
nucliadb/ingest/orm/index_message.py,sha256=
|
154
|
+
nucliadb/ingest/orm/index_message.py,sha256=hI85nSNVChNLLdEFuEJvOt61Tsir-Gq-2_WZoayAdvk,15617
|
156
155
|
nucliadb/ingest/orm/knowledgebox.py,sha256=_rkeTMIXMhR64gbYtZpFHoUHghV2DTJ2lUBqZsoqC_4,23898
|
157
156
|
nucliadb/ingest/orm/metrics.py,sha256=OiuggTh-n3kZHA2G73NEUdIlh8c3yFrbusI88DK-Mko,1273
|
158
|
-
nucliadb/ingest/orm/resource.py,sha256=
|
157
|
+
nucliadb/ingest/orm/resource.py,sha256=hGELQgnzK2wIWgD478bR5OiVDyAxHn6WrFSq2YuHANU,36896
|
159
158
|
nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,2693
|
160
159
|
nucliadb/ingest/orm/processor/__init__.py,sha256=Aqd9wCNTvggkMkCY3WvoI8spdr94Jnqk-0iq9XpLs18,922
|
161
160
|
nucliadb/ingest/orm/processor/auditing.py,sha256=TeYhXGJRyQ7ROytbb2u8R0fIh_FYi3HgTu3S1ribY3U,4623
|
162
161
|
nucliadb/ingest/orm/processor/data_augmentation.py,sha256=v-pj4GbBWSuO8dQyahs5UDr5ghsyfhCZDS0ftKd6ZYc,5179
|
163
162
|
nucliadb/ingest/orm/processor/pgcatalog.py,sha256=ht9_I5WlPc6sSFTY8PsxHlpjN-EsaBaChwqsLlMXwUk,3100
|
164
|
-
nucliadb/ingest/orm/processor/processor.py,sha256=
|
163
|
+
nucliadb/ingest/orm/processor/processor.py,sha256=jaEBwbv--WyoC8zcdxWAyF0dAzVA5crVDJl56Bqv1eI,31444
|
165
164
|
nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
|
166
165
|
nucliadb/ingest/service/__init__.py,sha256=LHQFUkdmNBOWqBG0Md9sMMI7g5TQZ-hLAnhw6ZblrJg,2002
|
167
166
|
nucliadb/ingest/service/exceptions.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
@@ -369,8 +368,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
369
368
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
370
369
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
371
370
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
372
|
-
nucliadb-6.4.0.
|
373
|
-
nucliadb-6.4.0.
|
374
|
-
nucliadb-6.4.0.
|
375
|
-
nucliadb-6.4.0.
|
376
|
-
nucliadb-6.4.0.
|
371
|
+
nucliadb-6.4.0.post4224.dist-info/METADATA,sha256=G9L1810f7GDMjI54RDmZj-ZcpBD3_duqsGRR2q3c6yY,4223
|
372
|
+
nucliadb-6.4.0.post4224.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
|
373
|
+
nucliadb-6.4.0.post4224.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
374
|
+
nucliadb-6.4.0.post4224.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
375
|
+
nucliadb-6.4.0.post4224.dist-info/RECORD,,
|