nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/resource.py
CHANGED
```diff
@@ -22,8 +22,9 @@ from __future__ import annotations
 import asyncio
 import logging
 from collections import defaultdict
+from collections.abc import Sequence
 from concurrent.futures import ThreadPoolExecutor
-from typing import
+from typing import Any
 
 from nucliadb.common import datamanagers
 from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
@@ -68,13 +69,11 @@ from nucliadb_protos.resources_pb2 import Origin as PBOrigin
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
 from nucliadb_protos.writer_pb2 import BrokerMessage
 from nucliadb_utils.storages.storage import Storage
-
-if TYPE_CHECKING:  # pragma: no cover
-    from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
+from nucliadb_utils.utilities import get_storage
 
 logger = logging.getLogger(__name__)
 
-KB_FIELDS: dict[int,
+KB_FIELDS: dict[int, type] = {
     FieldType.TEXT: Text,
     FieldType.FILE: File,
     FieldType.LINK: Link,
@@ -104,40 +103,55 @@ class Resource:
         self,
         txn: Transaction,
         storage: Storage,
-        kb: KnowledgeBox,
+        kbid: str,
         uuid: str,
-        basic: Optional[PBBasic] = None,
+        basic: PBBasic | None = None,
         disable_vectors: bool = True,
     ):
         self.fields: dict[tuple[FieldType.ValueType, str], Field] = {}
         self.conversations: dict[int, PBConversation] = {}
-        self.relations: Optional[PBRelations] = None
-        self.all_fields_keys: Optional[list[tuple[FieldType.ValueType, str]]] = None
-        self.origin: Optional[PBOrigin] = None
-        self.extra: Optional[PBExtra] = None
-        self.security: Optional[utils_pb2.Security] = None
+        self.relations: PBRelations | None = None
+        self.all_fields_keys: list[tuple[FieldType.ValueType, str]] | None = None
+        self.origin: PBOrigin | None = None
+        self.extra: PBExtra | None = None
+        self.security: utils_pb2.Security | None = None
         self.modified: bool = False
         self._modified_extracted_text: list[FieldID] = []
 
         self.txn = txn
         self.storage = storage
-        self.kb = kb
+        self.kbid = kbid
         self.uuid = uuid
         self.basic = basic
         self.disable_vectors = disable_vectors
-        self._previous_status: Optional[Metadata.Status.ValueType] = None
-        self.user_relations: Optional[PBRelations] = None
+        self._previous_status: Metadata.Status.ValueType | None = None
+        self.user_relations: PBRelations | None = None
         self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
 
+    @classmethod
+    async def get(cls, txn: Transaction, kbid: str, rid: str) -> Resource | None:
+        basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
+        if basic is None:
+            return None
+        storage = await get_storage()
+        return cls(
+            txn=txn,
+            storage=storage,
+            kbid=kbid,
+            uuid=rid,
+            basic=basic,
+            disable_vectors=False,
+        )
+
     async def set_slug(self):
         basic = await self.get_basic()
-        new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
+        new_key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug)
         await self.txn.set(new_key, self.uuid.encode())
 
     # Basic
-    async def get_basic(self) -> Optional[PBBasic]:
+    async def get_basic(self) -> PBBasic:
         if self.basic is None:
-            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=self.uuid)
             self.basic = basic if basic is not None else PBBasic()
         return self.basic
 
@@ -159,7 +173,7 @@ class Resource:
     async def set_basic(
         self,
         payload: PBBasic,
-        deleted_fields: Optional[list[FieldID]] = None,
+        deleted_fields: list[FieldID] | None = None,
     ):
         await self.get_basic()
 
@@ -212,49 +226,43 @@ class Resource:
         if deleted_fields is not None and len(deleted_fields) > 0:
             delete_basic_computedmetadata_classifications(self.basic, deleted_fields=deleted_fields)
 
-        await datamanagers.resources.set_basic(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, basic=self.basic
-        )
+        await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=self.uuid, basic=self.basic)
         self.modified = True
 
     # Origin
-    async def get_origin(self) -> Optional[PBOrigin]:
+    async def get_origin(self) -> PBOrigin | None:
         if self.origin is None:
-            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kbid, rid=self.uuid)
             self.origin = origin
         return self.origin
 
     async def set_origin(self, payload: PBOrigin):
-        await datamanagers.resources.set_origin(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, origin=payload
-        )
+        await datamanagers.resources.set_origin(self.txn, kbid=self.kbid, rid=self.uuid, origin=payload)
         self.modified = True
         self.origin = payload
 
     # Extra
-    async def get_extra(self) -> Optional[PBExtra]:
+    async def get_extra(self) -> PBExtra | None:
         if self.extra is None:
-            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kbid, rid=self.uuid)
             self.extra = extra
         return self.extra
 
     async def set_extra(self, payload: PBExtra):
-        await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
+        await datamanagers.resources.set_extra(self.txn, kbid=self.kbid, rid=self.uuid, extra=payload)
         self.modified = True
         self.extra = payload
 
     # Security
-    async def get_security(self) -> Optional[utils_pb2.Security]:
+    async def get_security(self) -> utils_pb2.Security | None:
         if self.security is None:
-            security = await datamanagers.resources.get_security(
-                self.txn, kbid=self.kb.kbid, rid=self.uuid
-            )
+            security = await datamanagers.resources.get_security(self.txn, kbid=self.kbid, rid=self.uuid)
             self.security = security
         return self.security
 
     async def set_security(self, payload: utils_pb2.Security) -> None:
         await datamanagers.resources.set_security(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, security=payload
+            self.txn, kbid=self.kbid, rid=self.uuid, security=payload
         )
         self.modified = True
         self.security = payload
@@ -262,7 +270,7 @@ class Resource:
     # Relations
     async def get_user_relations(self) -> PBRelations:
         if self.user_relations is None:
-            sf = self.storage.user_relations(self.kb.kbid, self.uuid)
+            sf = self.storage.user_relations(self.kbid, self.uuid)
             relations = await self.storage.download_pb(sf, PBRelations)
             if relations is None:
                 # Key not found = no relations
@@ -272,7 +280,7 @@ class Resource:
         return self.user_relations
 
     async def set_user_relations(self, payload: PBRelations):
-        sf = self.storage.user_relations(self.kb.kbid, self.uuid)
+        sf = self.storage.user_relations(self.kbid, self.uuid)
         await self.storage.upload_pb(sf, payload)
         self.modified = True
         self.user_relations = payload
@@ -354,25 +362,34 @@ class Resource:
 
         await field_obj.delete()
 
+    async def field_exists(self, type: FieldType.ValueType, field: str) -> bool:
+        """Return whether this resource has this field or not."""
+        all_fields_ids = await self.get_fields_ids()
+        for field_type, field_id in all_fields_ids:
+            if field_type == type and field_id == field:
+                return True
+        return False
+
     def has_field(self, type: FieldType.ValueType, field: str) -> bool:
+        # REVIEW: are we sure we don't want to actually check this?
         return (type, field) in self.fields
 
-    async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
+    async def get_all_field_ids(self, *, for_update: bool) -> PBAllFieldIDs | None:
         return await datamanagers.resources.get_all_field_ids(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
+            self.txn, kbid=self.kbid, rid=self.uuid, for_update=for_update
         )
 
     async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
         return await datamanagers.resources.set_all_field_ids(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, allfields=all_fields
+            self.txn, kbid=self.kbid, rid=self.uuid, allfields=all_fields
        )
 
     async def update_all_field_ids(
         self,
         *,
-        updated: Optional[list[FieldID]] = None,
-        deleted: Optional[list[FieldID]] = None,
-        errors: Optional[list[writer_pb2.Error]] = None,
+        updated: list[FieldID] | None = None,
+        deleted: list[FieldID] | None = None,
+        errors: list[writer_pb2.Error] | None = None,
     ):
         needs_update = False
         all_fields = await self.get_all_field_ids(for_update=True)
@@ -451,7 +468,7 @@ class Resource:
 
         # If this message comes from the processor (not a DA worker), we clear all previous errors
         # TODO: When generated_by is populated with DA tasks by processor, remove only related errors
-        from_processor = any(
+        from_processor = any(x.WhichOneof("generator") == "processor" for x in message.generated_by)
 
         for (field_type, field), errors in errors_by_field.items():
             field_obj = await self.get_field(field, field_type, load=False)
@@ -471,7 +488,7 @@ class Resource:
         # We infer the status for processor messages
         if message.source == BrokerMessage.MessageSource.PROCESSOR:
             if any(
-
+                e.source_error.severity == writer_pb2.Error.Severity.ERROR for e in status.errors
             ):
                 status.status = writer_pb2.FieldStatus.Status.ERROR
             else:
@@ -501,25 +518,21 @@ class Resource:
             return
 
         field_statuses = await datamanagers.fields.get_statuses(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, fields=field_ids.fields
+            self.txn, kbid=self.kbid, rid=self.uuid, fields=field_ids.fields
         )
 
         # If any field is processing -> PENDING
-        if any(
+        if any(f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses):
             self.basic.metadata.status = PBMetadata.Status.PENDING
         # If we have any non-DA error -> ERROR
         elif any(
-
-
-
-
-
-            and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
-            for e in f.errors
-            )
-        )
-            for f in field_statuses
+            f.status == writer_pb2.FieldStatus.Status.ERROR
+            and any(
+                e.source_error.severity == writer_pb2.Error.Severity.ERROR
+                and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
+                for e in f.errors
             )
+            for f in field_statuses
         ):
             self.basic.metadata.status = PBMetadata.Status.ERROR
         # Otherwise (everything processed or we only have DA errors) -> PROCESSED
@@ -642,7 +655,7 @@ class Resource:
             FieldType.LINK,
             load=False,
         )
-        maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kb.kbid)
+        maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kbid)
 
         await field_link.set_link_extracted_data(link_extracted_data)
 
@@ -669,7 +682,7 @@ class Resource:
             return
         logger.info(
             "Updating resource title from link extracted data",
-            extra={"kbid": self.kb.kbid, "field": link_extracted_data.field, "rid": self.uuid},
+            extra={"kbid": self.kbid, "field": link_extracted_data.field, "rid": self.uuid},
         )
         title = link_extracted_data.title
         await self.update_resource_title(title)
@@ -711,7 +724,7 @@ class Resource:
         # uri can change after extraction
         await field_file.set_file_extracted_data(file_extracted_data)
         maybe_update_basic_icon(self.basic, file_extracted_data.icon)
-        maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kb.kbid)
+        maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kbid)
         self.modified = True
 
     async def _should_update_resource_title_from_file_metadata(self) -> bool:
@@ -733,7 +746,7 @@ class Resource:
         filenames = set()
         for (field_type, _), field_obj in fields.items():
             if field_type == FieldType.FILE:
-                field_value: Optional[FieldFile] = await field_obj.get_value()
+                field_value: FieldFile | None = await field_obj.get_value()
                 if field_value is not None:
                     if field_value.file.filename not in ("", None):
                         filenames.add(field_value.file.filename)
@@ -758,7 +771,7 @@ class Resource:
         fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
         logger.info(
             "Updating resource title from file extracted data",
-            extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
+            extra={"kbid": self.kbid, "field": fid.full(), "new_title": fed.title},
         )
         await self.update_resource_title(fed.title)
         await self.unmark_title_for_reset()
@@ -776,9 +789,7 @@ class Resource:
         )
         await field_obj.set_field_metadata(field_metadata)
 
-        maybe_update_basic_thumbnail(
-            self.basic, field_metadata.metadata.metadata.thumbnail, self.kb.kbid
-        )
+        maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail, self.kbid)
 
         update_basic_computedmetadata_classifications(self.basic, field_metadata)
         self.modified = True
@@ -790,7 +801,7 @@ class Resource:
         await self.get_fields(force=True)
         vectorsets = {
             vectorset_id: vs
-            async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kb.kbid)
+            async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kbid)
         }
 
         for field_vectors in fields_vectors:
@@ -799,13 +810,13 @@ class Resource:
                 assert len(vectorsets) == 1, (
                     "Invalid broker message, can't ingest vectors from unknown vectorset to KB with multiple vectorsets"
                 )
-                vectorset =
+                vectorset = next(iter(vectorsets.values()))
 
             else:
                 if field_vectors.vectorset_id not in vectorsets:
                     logger.warning(
                         "Dropping extracted vectors for unknown vectorset",
-                        extra={"kbid": self.kb.kbid, "vectorset": field_vectors.vectorset_id},
+                        extra={"kbid": self.kbid, "vectorset": field_vectors.vectorset_id},
                     )
                     continue
 
@@ -916,7 +927,7 @@ def maybe_update_basic_summary(basic: PBBasic, summary_text: str) -> bool:
     return True
 
 
-def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
+def maybe_update_basic_icon(basic: PBBasic, mimetype: str | None) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
@@ -935,7 +946,7 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     return True
 
 
-def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile], kbid: str) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: CloudFile | None, kbid: str) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
@@ -972,7 +983,7 @@ def update_basic_languages(basic: Basic, languages: list[str]) -> bool:
     return updated
 
 
-def get_text_field_mimetype(bm: BrokerMessage) -> Optional[str]:
+def get_text_field_mimetype(bm: BrokerMessage) -> str | None:
     if len(bm.texts) == 0:
         return None
     text_format = next(iter(bm.texts.values())).format
```
nucliadb/ingest/orm/utils.py
CHANGED
```diff
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 import urllib.parse
-from typing import Sequence
+from collections.abc import Sequence
 
 from nucliadb.models.internal.processing import PushPayload, PushTextFormat, Text
 from nucliadb_protos.resources_pb2 import (
```
nucliadb/ingest/partitions.py
CHANGED
```diff
@@ -25,12 +25,17 @@ from nucliadb.ingest.settings import Settings
 
 
 def assign_partitions(settings: Settings):
+    """
+    This function dynamically assigns the partitions to the current ingest sts
+    replica based on its hostname, typically (ingest-0, ingest-1, etc).
+    """
     # partitions start from 1, instead of 0
     all_partitions = [str(part + 1) for part in range(settings.nuclia_partitions)]
 
     # get replica number and total replicas from environment
     logger.info(f"PARTITIONS: Total Replicas = {settings.total_replicas}")
     if settings.replica_number == -1:
+        # Get replica number from hostname
         hostname = os.environ.get("HOSTNAME")
         if hostname is not None:
             sts_values = hostname.split("-")
@@ -39,10 +44,16 @@ def assign_partitions(settings: Settings):
                     settings.replica_number = int(sts_values[-1])
                 except Exception:
                     logger.error(f"Could not extract replica number from hostname: {hostname}")
-
+            else:
+                logger.warning(f"Could not determine replica number from hostname: {hostname}")
+        else:
+            logger.warning(f"Could not determine replica number from hostname.")
 
     if settings.replica_number == -1:
         settings.replica_number = 0
+    else:
+        # We assume that replica numbers are set manually via env variables
+        pass
     logger.info(f"PARTITIONS: Replica Number = {settings.replica_number}")
 
     # calculate assigned partitions based on total replicas and own replica number
```
nucliadb/ingest/processing.py
CHANGED
```diff
@@ -25,7 +25,7 @@ import uuid
 from collections import defaultdict
 from contextlib import AsyncExitStack
 from enum import Enum
-from typing import Any, Optional
+from typing import Any
 
 import aiohttp
 import backoff
@@ -132,19 +132,19 @@ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> Process
 class ProcessingEngine:
     def __init__(
         self,
-        nuclia_service_account: Optional[str] = None,
-        nuclia_zone: Optional[str] = None,
-        nuclia_public_url: Optional[str] = None,
-        nuclia_processing_cluster_url: Optional[str] = None,
-        onprem: Optional[bool] = False,
-        nuclia_jwt_key: Optional[str] = None,
+        nuclia_service_account: str | None = None,
+        nuclia_zone: str | None = None,
+        nuclia_public_url: str | None = None,
+        nuclia_processing_cluster_url: str | None = None,
+        onprem: bool | None = False,
+        nuclia_jwt_key: str | None = None,
         days_to_keep: int = 3,
         driver: FileBackendConfig = FileBackendConfig.GCS,
     ):
         self.nuclia_service_account = nuclia_service_account
         self.nuclia_zone = nuclia_zone
         if nuclia_public_url is not None:
-            self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
+            self.nuclia_public_url: str | None = nuclia_public_url.format(zone=nuclia_zone)
         else:
             self.nuclia_public_url = None
 
@@ -196,7 +196,7 @@ class ProcessingEngine:
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
 
     def generate_file_token_from_fieldfile(
-        self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: FieldFilePB, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
@@ -235,7 +235,7 @@ class ProcessingEngine:
     )
     @processing_observer.wrap({"type": "file_field_upload"})
     async def convert_filefield_to_str(
-        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         # Upload file without storing on Nuclia DB
         headers = {}
@@ -273,7 +273,7 @@ class ProcessingEngine:
         ).decode()
 
     def convert_external_filefield_to_str(
-        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
@@ -313,7 +313,7 @@ class ProcessingEngine:
         self,
         file: FieldFilePB,
         storage: Storage,
-        classif_labels: Optional[list[ClassificationLabel]] = None,
+        classif_labels: list[ClassificationLabel] | None = None,
     ) -> str:
         """It's already an internal file that needs to be uploaded"""
         if self.onprem is False:
@@ -438,7 +438,7 @@ class ProcessingEngine:
             queue=QueueType(queue_type) if queue_type is not None else None,
         )
 
-    async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
+    async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
         """
         Delete a resource from processing. This prevents inflight resources from being processed
         and wasting resources.
@@ -479,7 +479,7 @@ class DummyProcessingEngine(ProcessingEngine):
         pass
 
     async def convert_filefield_to_str(
-        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
    ) -> str:
         self.calls.append([file])
         index = len(self.values["convert_filefield_to_str"])
@@ -487,7 +487,7 @@ class DummyProcessingEngine(ProcessingEngine):
         return f"convert_filefield_to_str,{index}"
 
     def convert_external_filefield_to_str(
-        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         self.calls.append([file_field])
         index = len(self.values["convert_external_filefield_to_str"])
@@ -498,7 +498,7 @@ class DummyProcessingEngine(ProcessingEngine):
         self,
         file: FieldFilePB,
         storage: Storage,
-        classif_labels: Optional[list[ClassificationLabel]] = None,
+        classif_labels: list[ClassificationLabel] | None = None,
     ) -> str:
         self.calls.append([file, storage])
         index = len(self.values["convert_internal_filefield_to_str"])
@@ -516,5 +516,5 @@ class DummyProcessingEngine(ProcessingEngine):
         self.values["send_to_process"].append([item, partition])
         return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
 
-    async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
+    async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
         self.calls.append([kbid, resource_id])
```