nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects each version as it appears in its public registry.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/processing.py
CHANGED

@@ -25,7 +25,7 @@ import uuid
 from collections import defaultdict
 from contextlib import AsyncExitStack
 from enum import Enum
-from typing import Any, Optional
+from typing import Any

 import aiohttp
 import backoff
@@ -132,19 +132,19 @@ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> Process
 class ProcessingEngine:
     def __init__(
         self,
-        nuclia_service_account: Optional[str] = None,
-        nuclia_zone: Optional[str] = None,
-        nuclia_public_url: Optional[str] = None,
-        nuclia_processing_cluster_url: Optional[str] = None,
-        onprem: Optional[bool] = False,
-        nuclia_jwt_key: Optional[str] = None,
+        nuclia_service_account: str | None = None,
+        nuclia_zone: str | None = None,
+        nuclia_public_url: str | None = None,
+        nuclia_processing_cluster_url: str | None = None,
+        onprem: bool | None = False,
+        nuclia_jwt_key: str | None = None,
         days_to_keep: int = 3,
         driver: FileBackendConfig = FileBackendConfig.GCS,
     ):
         self.nuclia_service_account = nuclia_service_account
         self.nuclia_zone = nuclia_zone
         if nuclia_public_url is not None:
-            self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
+            self.nuclia_public_url: str | None = nuclia_public_url.format(zone=nuclia_zone)
         else:
             self.nuclia_public_url = None

@@ -196,7 +196,7 @@ class ProcessingEngine:
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

     def generate_file_token_from_fieldfile(
-        self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: FieldFilePB, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
@@ -235,7 +235,7 @@ class ProcessingEngine:
     )
     @processing_observer.wrap({"type": "file_field_upload"})
     async def convert_filefield_to_str(
-        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         # Upload file without storing on Nuclia DB
         headers = {}
@@ -273,7 +273,7 @@ class ProcessingEngine:
         ).decode()

     def convert_external_filefield_to_str(
-        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
@@ -313,7 +313,7 @@ class ProcessingEngine:
         self,
         file: FieldFilePB,
         storage: Storage,
-        classif_labels: Optional[list[ClassificationLabel]] = None,
+        classif_labels: list[ClassificationLabel] | None = None,
     ) -> str:
         """It's already an internal file that needs to be uploaded"""
         if self.onprem is False:
@@ -438,7 +438,7 @@ class ProcessingEngine:
             queue=QueueType(queue_type) if queue_type is not None else None,
         )

-    async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
+    async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
         """
         Delete a resource from processing. This prevents inflight resources from being processed
         and wasting resources.
@@ -479,7 +479,7 @@ class DummyProcessingEngine(ProcessingEngine):
         pass

     async def convert_filefield_to_str(
-        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         self.calls.append([file])
         index = len(self.values["convert_filefield_to_str"])
@@ -487,7 +487,7 @@ class DummyProcessingEngine(ProcessingEngine):
         return f"convert_filefield_to_str,{index}"

     def convert_external_filefield_to_str(
-        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         self.calls.append([file_field])
         index = len(self.values["convert_external_filefield_to_str"])
@@ -498,7 +498,7 @@ class DummyProcessingEngine(ProcessingEngine):
         self,
         file: FieldFilePB,
         storage: Storage,
-        classif_labels: Optional[list[ClassificationLabel]] = None,
+        classif_labels: list[ClassificationLabel] | None = None,
     ) -> str:
         self.calls.append([file, storage])
         index = len(self.values["convert_internal_filefield_to_str"])
@@ -516,5 +516,5 @@ class DummyProcessingEngine(ProcessingEngine):
         self.values["send_to_process"].append([item, partition])
         return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)

-    async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
+    async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
         self.calls.append([kbid, resource_id])
nucliadb/ingest/serialize.py
CHANGED

@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #

-from typing import Optional, Union

 import nucliadb_models as models
 from nucliadb.common import datamanagers
@@ -32,15 +31,22 @@ from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.orm.resource import Resource as ORMResource
 from nucliadb_models.common import FieldTypeName
+from nucliadb_models.metadata import Extra, Origin, Relation
 from nucliadb_models.resource import (
     ConversationFieldData,
     ConversationFieldExtractedData,
     Error,
     ExtractedDataType,
     ExtractedDataTypeName,
+    ExtractedText,
+    FieldComputedMetadata,
+    FieldQuestionAnswers,
+    FileExtractedData,
     FileFieldData,
     FileFieldExtractedData,
     GenericFieldData,
+    LargeComputedMetadata,
+    LinkExtractedData,
     LinkFieldData,
     LinkFieldExtractedData,
     QueueType,
@@ -48,6 +54,7 @@ from nucliadb_models.resource import (
     ResourceData,
     TextFieldData,
     TextFieldExtractedData,
+    VectorObject,
 )
 from nucliadb_models.search import ResourceProperties
 from nucliadb_models.security import ResourceSecurity
@@ -56,84 +63,15 @@ from nucliadb_protos.writer_pb2 import FieldStatus
 from nucliadb_utils.utilities import get_storage


-async def set_resource_field_extracted_data(
-    field: Field,
-    field_data: ExtractedDataType,
-    field_type_name: FieldTypeName,
-    wanted_extracted_data: list[ExtractedDataTypeName],
-) -> None:
-    if field_data is None:
-        return
-
-    if ExtractedDataTypeName.TEXT in wanted_extracted_data:
-        data_et = await field.get_extracted_text()
-        if data_et is not None:
-            field_data.text = from_proto.extracted_text(data_et)
-
-    metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
-    shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
-    if metadata_wanted or shortened_metadata_wanted:
-        data_fcm = await field.get_field_metadata()
-
-        if data_fcm is not None:
-            field_data.metadata = from_proto.field_computed_metadata(
-                data_fcm, shortened=shortened_metadata_wanted and not metadata_wanted
-            )
-
-    if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
-        data_lcm = await field.get_large_field_metadata()
-        if data_lcm is not None:
-            field_data.large_metadata = from_proto.large_computed_metadata(data_lcm)
-
-    if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
-        # XXX: our extracted API is not vectorset-compatible, so we'll get the
-        # first vectorset and return the values. Ideally, we should provide a
-        # way to select a vectorset
-        vectorset_id = None
-        async with datamanagers.with_ro_transaction() as txn:
-            async for vectorset_id, vs in datamanagers.vectorsets.iter(
-                txn=txn,
-                kbid=field.resource.kb.kbid,
-            ):
-                break
-        assert vectorset_id is not None, "All KBs must have at least a vectorset"
-        data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
-        if data_vec is not None:
-            field_data.vectors = from_proto.vector_object(data_vec)
-
-    if ExtractedDataTypeName.QA in wanted_extracted_data:
-        qa = await field.get_question_answers()
-        if qa is not None:
-            field_data.question_answers = from_proto.field_question_answers(qa)
-
-    if (
-        isinstance(field, File)
-        and isinstance(field_data, FileFieldExtractedData)
-        and ExtractedDataTypeName.FILE in wanted_extracted_data
-    ):
-        data_fed = await field.get_file_extracted_data()
-        if data_fed is not None:
-            field_data.file = from_proto.file_extracted_data(data_fed)
-
-    if (
-        isinstance(field, Link)
-        and isinstance(field_data, LinkFieldExtractedData)
-        and ExtractedDataTypeName.LINK in wanted_extracted_data
-    ):
-        data_led = await field.get_link_extracted_data()
-        if data_led is not None:
-            field_data.link = from_proto.link_extracted_data(data_led)
-
-
 async def serialize(
     kbid: str,
-    rid: Optional[str],
+    rid: str | None,
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
     extracted: list[ExtractedDataTypeName],
-    service_name: Optional[str] = None,
-    slug: Optional[str] = None,
-) -> Optional[Resource]:
+    service_name: str | None = None,
+    slug: str | None = None,
+) -> Resource | None:
     driver = get_driver()
     async with driver.ro_transaction() as txn:
         return await managed_serialize(
@@ -148,45 +86,56 @@ async def serialize(
         )


-async def serialize_field_errors(
-    field: Field,
-    serialized: Union[
-        TextFieldData, FileFieldData, LinkFieldData, ConversationFieldData, GenericFieldData
-    ],
-):
-    status = await field.get_status()
-    if status is None:
-        status = FieldStatus()
-    serialized.status = status.Status.Name(status.status)
-    if status.errors:
-        serialized.errors = []
-        for error in status.errors:
-            serialized.errors.append(
-                Error(
-                    body=error.source_error.error,
-                    code=error.source_error.code,
-                    code_str=writer_pb2.Error.ErrorCode.Name(error.source_error.code),
-                    created=error.created.ToDatetime(),
-                    severity=writer_pb2.Error.Severity.Name(error.source_error.severity),
-                )
-            )
-        serialized.error = serialized.errors[-1]
-
-
 async def managed_serialize(
     txn: Transaction,
     kbid: str,
-    rid: Optional[str],
+    rid: str | None,
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
     extracted: list[ExtractedDataTypeName],
-    service_name: Optional[str] = None,
-    slug: Optional[str] = None,
-) -> Optional[Resource]:
+    service_name: str | None = None,
+    slug: str | None = None,
+) -> Resource | None:
     orm_resource = await get_orm_resource(txn, kbid, rid=rid, slug=slug, service_name=service_name)
     if orm_resource is None:
         return None

+    return await serialize_resource(orm_resource, show, field_type_filter, extracted)
+
+
+async def get_orm_resource(
+    txn: Transaction,
+    kbid: str,
+    rid: str | None,
+    slug: str | None = None,
+    service_name: str | None = None,
+) -> ORMResource | None:
+    storage = await get_storage(service_name=service_name)
+
+    kb = KnowledgeBox(txn, storage, kbid)
+
+    if rid is None:
+        if slug is None:
+            raise ValueError("Either rid or slug parameters should be used")
+
+        rid = await kb.get_resource_uuid_by_slug(slug)
+        if rid is None:
+            # Could not find resource uuid from slug
+            return None
+
+    orm_resource = await kb.get(rid)
+    if orm_resource is None:
+        return None
+
+    return orm_resource
+
+
+async def serialize_resource(
+    orm_resource: ORMResource,
+    show: list[ResourceProperties],
+    field_type_filter: list[FieldTypeName],
+    extracted: list[ExtractedDataTypeName],
+) -> Resource:
     resource = Resource(id=orm_resource.uuid)

     include_values = ResourceProperties.VALUES in show
@@ -230,29 +179,18 @@ async def managed_serialize(
     resource.queue = QueueType[orm_resource.basic.QueueType.Name(orm_resource.basic.queue)]

     if ResourceProperties.RELATIONS in show:
-        relations = await orm_resource.get_user_relations()
-        resource.usermetadata.relations = [
-            from_proto.relation(rel) for rel in relations.relations
-        ]
+        resource.usermetadata.relations = await serialize_user_relations(orm_resource)

     if ResourceProperties.ORIGIN in show:
-        await orm_resource.get_origin()
-        if orm_resource.origin is not None:
-            resource.origin = from_proto.origin(orm_resource.origin)
+        resource.origin = await serialize_origin(orm_resource)

     if ResourceProperties.EXTRA in show:
-        await orm_resource.get_extra()
-        if orm_resource.extra is not None:
-            resource.extra = from_proto.extra(orm_resource.extra)
+        resource.extra = await serialize_extra(orm_resource)

     include_errors = ResourceProperties.ERRORS in show

     if ResourceProperties.SECURITY in show:
-        await orm_resource.get_security()
-        resource.security = ResourceSecurity(access_groups=[])
-        if orm_resource.security is not None:
-            for gid in orm_resource.security.access_groups:
-                resource.security.access_groups.append(gid)
+        resource.security = await serialize_security(orm_resource)

     if (field_type_filter and (include_values or include_extracted_data)) or include_errors:
         await orm_resource.get_fields()
@@ -360,38 +298,157 @@
     return resource


-async def get_orm_resource(
-    txn: Transaction,
-    kbid: str,
-    rid: Optional[str],
-    slug: Optional[str] = None,
-    service_name: Optional[str] = None,
-) -> Optional[ORMResource]:
-    storage = await get_storage(service_name=service_name)
+async def serialize_origin(resource: ORMResource) -> Origin | None:
+    origin = await resource.get_origin()
+    if origin is None:
+        return None

-    kb = KnowledgeBox(txn, storage, kbid)
+    return from_proto.origin(origin)

-    if rid is None:
-        if slug is None:
-            raise ValueError("Either rid or slug parameters should be used")

-        rid = await kb.get_resource_uuid_by_slug(slug)
-        if rid is None:
-            # Could not find resource uuid from slug
-            return None
+async def serialize_extra(resource: ORMResource) -> Extra | None:
+    extra = await resource.get_extra()
+    if extra is None:
+        return None
+    return from_proto.extra(extra)

-    orm_resource = await kb.get(rid)
-    if orm_resource is None:
+
+async def serialize_user_relations(resource: ORMResource) -> list[Relation]:
+    relations = await resource.get_user_relations()
+    return [from_proto.relation(rel) for rel in relations.relations]
+
+
+async def serialize_security(resource: ORMResource) -> ResourceSecurity:
+    security = ResourceSecurity(access_groups=[])
+
+    security_pb = await resource.get_security()
+    if security_pb is not None:
+        for gid in security_pb.access_groups:
+            security.access_groups.append(gid)
+
+    return security
+
+
+async def serialize_field_errors(
+    field: Field,
+    serialized: (
+        TextFieldData | FileFieldData | LinkFieldData | ConversationFieldData | GenericFieldData
+    ),
+):
+    status = await field.get_status()
+    if status is None:
+        status = FieldStatus()
+    serialized.status = status.Status.Name(status.status)
+    if status.errors:
+        serialized.errors = []
+        for error in status.errors:
+            serialized.errors.append(
+                Error(
+                    body=error.source_error.error,
+                    code=error.source_error.code,
+                    code_str=writer_pb2.Error.ErrorCode.Name(error.source_error.code),
+                    created=error.created.ToDatetime(),
+                    severity=writer_pb2.Error.Severity.Name(error.source_error.severity),
+                )
+            )
+        serialized.error = serialized.errors[-1]
+
+
+async def set_resource_field_extracted_data(
+    field: Field,
+    field_data: ExtractedDataType,
+    field_type_name: FieldTypeName,
+    wanted_extracted_data: list[ExtractedDataTypeName],
+) -> None:
+    if field_data is None:
+        return
+
+    if ExtractedDataTypeName.TEXT in wanted_extracted_data:
+        field_data.text = await serialize_extracted_text(field)
+
+    metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
+    shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
+    if metadata_wanted or shortened_metadata_wanted:
+        field_data.metadata = await serialize_extracted_metadata(
+            field, shortened=shortened_metadata_wanted and not metadata_wanted
+        )
+
+    if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
+        field_data.large_metadata = await serialize_extracted_large_metadata(field)
+
+    if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
+        field_data.vectors = await serialize_extracted_vectors(field)
+
+    if ExtractedDataTypeName.QA in wanted_extracted_data:
+        field_data.question_answers = await serialize_extracted_question_answers(field)
+
+    if (
+        isinstance(field, File)
+        and isinstance(field_data, FileFieldExtractedData)
+        and ExtractedDataTypeName.FILE in wanted_extracted_data
+    ):
+        field_data.file = await serialize_file_extracted_data(field)
+
+    if (
+        isinstance(field, Link)
+        and isinstance(field_data, LinkFieldExtractedData)
+        and ExtractedDataTypeName.LINK in wanted_extracted_data
+    ):
+        field_data.link = await serialize_link_extracted_data(field)
+
+
+async def serialize_extracted_text(field: Field) -> ExtractedText | None:
+    data_et = await field.get_extracted_text()
+    if data_et is None:
         return None
+    return from_proto.extracted_text(data_et)

-    return orm_resource

+async def serialize_extracted_metadata(field: Field, *, shortened: bool) -> FieldComputedMetadata | None:
+    data_fcm = await field.get_field_metadata()
+    if data_fcm is None:
+        return None
+    return from_proto.field_computed_metadata(data_fcm, shortened)

-
-
-
-
-
-
-
-
+
+async def serialize_extracted_large_metadata(field: Field) -> LargeComputedMetadata | None:
+    data_lcm = await field.get_large_field_metadata()
+    if data_lcm is None:
+        return None
+    return from_proto.large_computed_metadata(data_lcm)
+
+
+async def serialize_extracted_vectors(field: Field) -> VectorObject | None:
+    # XXX: our extracted API is not vectorset-compatible, so we'll get the
+    # first vectorset and return the values. Ideally, we should provide a
+    # way to select a vectorset
+    vectorset_id = None
+    async with datamanagers.with_ro_transaction() as txn:
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(txn=txn, kbid=field.kbid):
+            break
+    assert vectorset_id is not None, "All KBs must have at least a vectorset"
+    data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
+    if data_vec is None:
+        return None
+    return from_proto.vector_object(data_vec)
+
+
+async def serialize_extracted_question_answers(field: Field) -> FieldQuestionAnswers | None:
+    qa = await field.get_question_answers()
+    if qa is None:
+        return None
+    return from_proto.field_question_answers(qa)
+
+
+async def serialize_file_extracted_data(field: File) -> FileExtractedData | None:
+    data_fed = await field.get_file_extracted_data()
+    if data_fed is None:
+        return None
+    return from_proto.file_extracted_data(data_fed)
+
+
+async def serialize_link_extracted_data(field: Link) -> LinkExtractedData | None:
+    data_led = await field.get_link_extracted_data()
+    if data_led is None:
+        return None
+    return from_proto.link_extracted_data(data_led)
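The serialize.py rewrite is a decomposition rather than a behavior change: the old monolithic managed_serialize is split into get_orm_resource (resolve a rid or slug to an ORM resource) and serialize_resource (build the API model from it), and each resource property gains a dedicated helper. Every new helper follows one shape: fetch a protobuf, return None when it is absent, otherwise convert it with from_proto. A minimal self-contained sketch of that shape, using hypothetical stand-in types rather than nucliadb's real ones:

    import asyncio
    from dataclasses import dataclass


    @dataclass
    class OriginPB:
        source: str  # hypothetical stand-in for the stored protobuf message


    @dataclass
    class Origin:
        source: str  # hypothetical stand-in for the API model


    async def get_origin_pb(found: bool) -> OriginPB | None:
        # Stand-in for resource.get_origin(): the field may simply not exist.
        return OriginPB(source="upload") if found else None


    async def serialize_origin(found: bool) -> Origin | None:
        # The shape every new helper follows: fetch, early-return None, convert.
        origin_pb = await get_origin_pb(found)
        if origin_pb is None:
            return None
        return Origin(source=origin_pb.source)  # stand-in for from_proto.origin(...)


    print(asyncio.run(serialize_origin(True)))   # Origin(source='upload')
    print(asyncio.run(serialize_origin(False)))  # None

Having the helpers return None lets serialize_resource assign resource.origin, resource.extra, and friends unconditionally, instead of guarding each assignment at the call site as the old managed_serialize did.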