nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/common/ids.py
CHANGED
@@ -24,7 +24,6 @@ paragraphs... Avoiding spread of id construction and parsing everywhere
 """
 
 from dataclasses import dataclass
-from typing import Optional
 
 from nucliadb_models.common import FieldTypeName
 from nucliadb_protos.resources_pb2 import FieldType
@@ -47,6 +46,8 @@ FIELD_TYPE_NAME_TO_STR = {
     FieldTypeName.CONVERSATION: "c",
 }
 
+FIELD_TYPE_STR_TO_NAME = {v: k for k, v in FIELD_TYPE_NAME_TO_STR.items()}
+
 
 @dataclass
 class FieldId:
@@ -65,7 +66,7 @@ class FieldId:
 
     Examples:
 
-    >>> FieldId(rid="rid", type="u", key="
+    >>> FieldId(rid="rid", type="u", key="my-link")
     FieldID("rid/u/my-link")
     >>> FieldId.from_string("rid/u/my-link")
     FieldID("rid/u/my-link")
@@ -75,32 +76,7 @@ class FieldId:
     type: str
     key: str
     # also knwon as `split`, this indicates a part of a field in, for example, conversations
-    subfield_id: Optional[str] = None
-
-    def __repr__(self) -> str:
-        return f"FieldId({self.full()})"
-
-    def short_without_subfield(self) -> str:
-        return f"/{self.type}/{self.key}"
-
-    def full(self) -> str:
-        if self.subfield_id is None:
-            return f"{self.rid}/{self.type}/{self.key}"
-        else:
-            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def pb_type(self) -> FieldType.ValueType:
-        return FIELD_TYPE_STR_TO_PB[self.type]
-
-    @classmethod
-    def from_pb(
-        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
-    ) -> "FieldId":
-        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+    subfield_id: str | None = None
 
     @classmethod
     def from_string(cls, value: str) -> "FieldId":
@@ -120,11 +96,11 @@ class FieldId:
         parts = value.split("/")
         if len(parts) == 3:
             rid, _type, key = parts
-            _type = cls.
+            _type = cls._parse_field_type(_type)
             return cls(rid=rid, type=_type, key=key)
         elif len(parts) == 4:
             rid, _type, key, subfield_id = parts
-            _type = cls.
+            _type = cls._parse_field_type(_type)
             return cls(
                 rid=rid,
                 type=_type,
@@ -135,7 +111,49 @@ class FieldId:
         raise ValueError(f"Invalid FieldId: {value}")
 
     @classmethod
-    def
+    def from_pb(
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: str | None = None
+    ) -> "FieldId":
+        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+
+    @property
+    def pb_type(self) -> FieldType.ValueType:
+        return FIELD_TYPE_STR_TO_PB[self.type]
+
+    def full(self) -> str:
+        if self.subfield_id is None:
+            return f"{self.rid}/{self.type}/{self.key}"
+        else:
+            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+
+    def full_without_subfield(self) -> str:
+        return f"{self.rid}/{self.type}/{self.key}"
+
+    def short_without_subfield(self) -> str:
+        return f"/{self.type}/{self.key}"
+
+    def paragraph_id(self, paragraph_start: int, paragraph_end: int) -> "ParagraphId":
+        """Generate a ParagraphId from the current field given its start and
+        end.
+
+        """
+        return ParagraphId(
+            field_id=self,
+            paragraph_start=paragraph_start,
+            paragraph_end=paragraph_end,
+        )
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"FieldId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
+    @staticmethod
+    def _parse_field_type(_type: str) -> str:
         if _type not in FIELD_TYPE_STR_TO_PB:
             # Try to parse the enum value
             # XXX: This is to support field types that are integer values of FieldType
@@ -157,19 +175,6 @@ class ParagraphId:
     paragraph_start: int
     paragraph_end: int
 
-    def __repr__(self) -> str:
-        return f"ParagraphId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "ParagraphId":
         parts = value.split("/")
@@ -192,6 +197,22 @@ class ParagraphId:
             paragraph_end=vid.vector_end,
         )
 
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"ParagraphId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
 
 @dataclass
 class VectorId:
@@ -217,19 +238,6 @@ class VectorId:
     vector_start: int
     vector_end: int
 
-    def __repr__(self) -> str:
-        return f"VectorId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
    @classmethod
    def from_string(cls, value: str) -> "VectorId":
        parts = value.split("/")
@@ -239,8 +247,24 @@ class VectorId:
         field_id = FieldId.from_string("/".join(parts[:-2]))
         return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)
 
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"VectorId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
 
-def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
+def extract_data_augmentation_id(generated_field_id: str) -> str | None:
     """Data augmentation generated fields have a strict id with the following
     format:
     `da-{task_id}-{original:field_type}-{original:field_id}[-{original:split}]`
nucliadb/common/locking.py
CHANGED
@@ -22,7 +22,6 @@ import logging
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Optional
 
 import orjson
 
@@ -99,7 +98,7 @@ class _Lock:
         self.task = asyncio.create_task(self._refresh_task())
         return self
 
-    async def get_lock_data(self, txn: Transaction) -> Optional[LockValue]:
+    async def get_lock_data(self, txn: Transaction) -> LockValue | None:
         existing_data = await txn.get(self.key, for_update=True)
         if existing_data is None:
             return None
nucliadb/common/maindb/driver.py
CHANGED
@@ -20,8 +20,9 @@
 from __future__ import annotations
 
 import asyncio
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
+from typing import ClassVar
 
 DEFAULT_SCAN_LIMIT = -1
 DEFAULT_BATCH_SCAN_LIMIT = 500
@@ -37,10 +38,10 @@ class Transaction:
     async def commit(self):
         raise NotImplementedError()
 
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
         raise NotImplementedError()
 
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         raise NotImplementedError()
 
     async def set(self, key: str, value: bytes):
@@ -57,7 +58,7 @@ class Transaction:
 
     def keys(
         self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         raise NotImplementedError()
 
     async def count(self, match: str) -> int:
@@ -66,7 +67,7 @@ class Transaction:
 
 class Driver:
     initialized = False
-    _abort_tasks: list[asyncio.Task] = []
+    _abort_tasks: ClassVar[list[asyncio.Task]] = []
 
     async def initialize(self):
         raise NotImplementedError()
@@ -81,15 +82,15 @@ class Driver:
         pass
 
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         yield Transaction()
 
     @asynccontextmanager
-    async def ro_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def ro_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=True) as txn:
             yield txn
 
     @asynccontextmanager
-    async def rw_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def rw_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=False) as txn:
             yield txn
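The driver changes follow a typing modernization repeated across this release: `Optional[X]` and `Union[X, Y]` become PEP 604 unions, `AsyncGenerator` moves from `typing` (a deprecated alias) to `collections.abc` (where, since Python 3.13, the send type defaults to `None`, so `AsyncGenerator[str]` reads as `AsyncGenerator[str, None]`), and the shared mutable class attribute gains `ClassVar` so type checkers stop treating it as an instance field. A before/after sketch of the pattern (mine, not package code):

    # 6.7 style
    from typing import AsyncGenerator, Optional
    async def get(key: str) -> Optional[bytes]: ...

    # 6.10 style
    import asyncio
    from collections.abc import AsyncGenerator
    from typing import ClassVar

    class Driver:
        _abort_tasks: ClassVar[list[asyncio.Task]] = []
        async def get(self, key: str, for_update: bool = False) -> bytes | None: ...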
nucliadb/common/maindb/local.py
CHANGED
@@ -19,8 +19,8 @@
 #
 import glob
 import os
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
 
 from nucliadb.common.maindb.driver import (
     DEFAULT_BATCH_SCAN_LIMIT,
@@ -78,7 +78,7 @@ class LocalTransaction(Transaction):
             # Deleting a key that does not exist
             pass
 
-    async def read(self, key: str) -> Optional[bytes]:
+    async def read(self, key: str) -> bytes | None:
         try:
             async with aiofiles.open(self.compute_path(key), "rb") as resp:
                 return await resp.read()
@@ -106,8 +106,8 @@ class LocalTransaction(Transaction):
         self.clean()
         self.open = False
 
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
-        results: list[Optional[bytes]] = []
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
+        results: list[bytes | None] = []
         for key in keys:
             obj = await self.get(key)
             if obj:
@@ -125,7 +125,7 @@ class LocalTransaction(Transaction):
 
         return results
 
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         if key in self.deleted_keys:
             raise KeyError(f"Not found {key}")
 
nucliadb/common/maindb/pg.py
CHANGED
@@ -21,8 +21,9 @@ from __future__ import annotations
 
 import asyncio
 import logging
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import Any, AsyncGenerator, Optional
+from typing import Any
 
 import backoff
 import psycopg
@@ -72,7 +73,7 @@ class DataLayer:
         self.connection = connection
         self.log_on_select_for_update = settings.driver_pg_log_on_select_for_update
 
-    async def get(self, key: str, select_for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, select_for_update: bool = False) -> bytes | None:
         with pg_observer({"type": "get"}):
             statement = "SELECT value FROM resources WHERE key = %s"
             if select_for_update:
@@ -116,7 +117,7 @@ class DataLayer:
         async with self.connection.cursor() as cur:
             await cur.execute("DELETE FROM resources WHERE key LIKE %s", (prefix + "%",))
 
-    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[bytes | None]:
         with pg_observer({"type": "batch_get"}):
             async with self.connection.cursor() as cur:
                 statement = "SELECT key, value FROM resources WHERE key = ANY(%s)"
@@ -134,7 +135,7 @@ class DataLayer:
         prefix: str,
         limit: int = DEFAULT_SCAN_LIMIT,
         include_start: bool = True,
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         query = "SELECT key FROM resources WHERE key LIKE %s ORDER BY key"
 
         args: list[Any] = [prefix + "%"]
@@ -190,7 +191,7 @@ class PGTransaction(Transaction):
     async def batch_get(self, keys: list[str], for_update: bool = True):
         return await self.data_layer.batch_get(keys, select_for_update=for_update)
 
-    async def get(self, key: str, for_update: bool = True) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = True) -> bytes | None:
         return await self.data_layer.get(key, select_for_update=for_update)
 
     async def set(self, key: str, value: bytes):
@@ -243,7 +244,7 @@ class ReadOnlyPGTransaction(Transaction):
             return await DataLayer(conn).batch_get(keys, select_for_update=False)
 
     @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         async with self.driver._get_connection() as conn:
             return await DataLayer(conn).get(key, select_for_update=False)
 
@@ -330,7 +331,7 @@ class PGDriver(Driver):
             metric.set(value)
 
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         if read_only:
             yield ReadOnlyPGTransaction(self)
         else:
@@ -343,7 +344,7 @@ class PGDriver(Driver):
             await txn.abort()
 
     @asynccontextmanager
-    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection, None]:
+    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection]:
         timeout = self.acquire_timeout_ms / 1000
         # Manual retry loop since backoff.on_exception does not play well with async context managers
         retries = 0
nucliadb/common/nidx.py
CHANGED
@@ -19,7 +19,6 @@
 #
 
 import os
-from typing import Optional, Union
 
 from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
 from nidx_protos.nodewriter_pb2 import (
@@ -54,7 +53,7 @@ class NidxUtility:
         pass
 
 
-def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
+def _storage_config(prefix: str, bucket: str | None) -> dict[str, str]:
     config = {}
     if storage_settings.file_backend == FileBackendConfig.LOCAL:
         local_bucket = bucket or storage_settings.local_indexing_bucket
@@ -82,6 +81,24 @@ def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
         config[f"{prefix}__REGION_NAME"] = storage_settings.s3_region_name or ""
         if storage_settings.s3_endpoint:
             config[f"{prefix}__ENDPOINT"] = storage_settings.s3_endpoint
+    elif storage_settings.file_backend == FileBackendConfig.AZURE:
+        if storage_settings.azure_account_url is None:
+            raise ValueError("Azure account is required")
+        config[f"{prefix}__OBJECT_STORE"] = "azure"
+        url = storage_settings.azure_account_url
+        container = bucket or extended_storage_settings.azure_indexing_bucket
+        if container:
+            url += f"/{container}"
+        config[f"{prefix}__CONTAINER_URL"] = url
+        if storage_settings.azure_connection_string:
+            params = {
+                p.split("=", 1)[0]: p.split("=", 1)[1]
+                for p in storage_settings.azure_connection_string.split(";")
+            }
+            if "AccountKey" in params:
+                config[f"{prefix}__ACCOUNT_KEY"] = params["AccountKey"]
+            if "BlobEndpoint" in params:
+                config[f"{prefix}__ENDPOINT"] = params["BlobEndpoint"]
 
     return config
 
@@ -143,7 +160,7 @@ class NidxNatsIndexer:
     async def index(self, writer: IndexMessage) -> int:
         res = await self.nats_connection_manager.js.publish(self.subject, writer.SerializeToString())
         logger.info(
-            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"
+            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"
         )
         return res.seq
 
@@ -167,7 +184,7 @@ class NidxGrpcIndexer:
 class NidxServiceUtility(NidxUtility):
     """Implements Nidx utility connecting to the network service"""
 
-    indexer: Union[NidxNatsIndexer, NidxGrpcIndexer]
+    indexer: NidxNatsIndexer | NidxGrpcIndexer
 
     def __init__(self, service_name: str):
         self.service_name = service_name
@@ -198,7 +215,7 @@ class NidxServiceUtility(NidxUtility):
         return await self.indexer.index(writer)
 
 
-async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> Optional[NidxUtility]:
+async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> NidxUtility:
     nidx = get_utility(Utility.NIDX)
     if nidx:
         return nidx
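The new Azure branch derives nidx storage settings from an Azure connection string by splitting it into key=value pairs. `split("=", 1)` matters here: base64 account keys end in `=` padding, which an unbounded split would mangle. A standalone sketch with an illustrative Azurite-style string (placeholder values, not real credentials):

    conn_str = (
        "DefaultEndpointsProtocol=http;"
        "AccountName=devstoreaccount1;"
        "AccountKey=aGVsbG8=;"
        "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1"
    )
    params = {p.split("=", 1)[0]: p.split("=", 1)[1] for p in conn_str.split(";")}
    assert params["AccountKey"] == "aGVsbG8="  # '=' padding preserved by maxsplit=1
    assert params["BlobEndpoint"].startswith("http://127.0.0.1:10000")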
nucliadb/common/vector_index_config.py
CHANGED
@@ -26,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2 as Nucliadb
 def nucliadb_vector_type_to_nidx(nucliadb: Nucliadb.VectorType.ValueType) -> Nidx.VectorType.ValueType:
     if nucliadb == Nucliadb.DENSE_F32:
         return Nidx.DENSE_F32
-    else:  # pragma:
+    else:  # pragma: no cover
         raise Exception("Unknown vector type")
 
 
nucliadb/export_import/datamanager.py
CHANGED
@@ -18,8 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import json
+from collections.abc import AsyncGenerator
 from datetime import datetime, timezone
-from typing import AsyncGenerator, Type, Union, cast
+from typing import Type, cast
 
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.export_import import logger
@@ -34,7 +35,7 @@ MAINDB_IMPORT_KEY = "/kbs/{kbid}/imports/{id}"
 STORAGE_EXPORT_KEY = "exports/{export_id}"
 STORAGE_IMPORT_KEY = "imports/{import_id}"
 
-Metadata = Union[ExportMetadata, ImportMetadata]
+Metadata = ExportMetadata | ImportMetadata
 
 
 class ExportImportDataManager:
@@ -59,7 +60,7 @@ class ExportImportDataManager:
         if data is None or data == b"":
             raise MetadataNotFound()
         decoded = data.decode("utf-8")
-        model_type: Union[Type[ExportMetadata], Type[ImportMetadata]]
+        model_type: Type[ExportMetadata] | Type[ImportMetadata]
         if type == "export":
             model_type = ExportMetadata
         elif type == "import":
nucliadb/export_import/exporter.py
CHANGED
@@ -18,11 +18,12 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from typing import AsyncGenerator, Optional
+from collections.abc import AsyncGenerator
 
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ExportMetadata,
@@ -33,7 +34,6 @@ from nucliadb.export_import.utils import (
     download_binary,
     get_broker_message,
     get_cloud_files,
-    get_entities,
     get_labels,
     get_learning_config,
     iter_kb_resource_uuids,
@@ -43,7 +43,7 @@ from nucliadb_telemetry import errors
 
 
 async def export_kb(
-    context: ApplicationContext, kbid: str, metadata: Optional[ExportMetadata] = None
+    context: ApplicationContext, kbid: str, metadata: ExportMetadata | None = None
 ) -> AsyncGenerator[bytes, None]:
     """Export the data of a knowledgebox to a stream of bytes.
 
@@ -63,9 +63,6 @@ async def export_kb(
     async for chunk in resources_iterator:
         yield chunk
 
-    async for chunk in export_entities(context, kbid):
-        yield chunk
-
     async for chunk in export_labels(context, kbid):
         yield chunk
 
@@ -76,7 +73,14 @@ async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMe
     """
     kbid, export_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    try:
+        metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Export metadata not found. Skipping export.", extra={"kbid": kbid, "export_id": export_id}
+        )
+        return
+
     iterator = export_kb(context, kbid, metadata)
 
     retry_handler = TaskRetryHandler("export", dm, metadata)
@@ -167,18 +171,6 @@ async def export_resource_with_binaries(
         yield bm_bytes
 
 
-async def export_entities(
-    context: ApplicationContext,
-    kbid: str,
-) -> AsyncGenerator[bytes, None]:
-    entities = await get_entities(context, kbid)
-    if len(entities.entities_groups) > 0:
-        data = entities.SerializeToString()
-        yield ExportedItemType.ENTITIES.encode("utf-8")
-        yield len(data).to_bytes(4, byteorder="big")
-        yield data
-
-
 async def export_labels(
     context: ApplicationContext,
     kbid: str,
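The removed `export_entities` also documents the export stream framing used for exported items: a type tag, a 4-byte big-endian payload length, then the payload. A minimal sketch of that framing (the reader side is illustrative; nucliadb's importer has its own stream handling):

    def frame(item_type: str, data: bytes) -> bytes:
        # mirrors the removed code: tag, then len(data) as 4 bytes big-endian, then payload
        return item_type.encode("utf-8") + len(data).to_bytes(4, byteorder="big") + data

    buf = frame("labels", b"\x08\x01")  # tag and payload are made-up placeholders
    assert int.from_bytes(buf[6:10], "big") == 2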
nucliadb/export_import/importer.py
CHANGED
@@ -17,11 +17,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import AsyncGenerator, Callable, Optional, cast
+from collections.abc import AsyncGenerator, Callable
+from typing import cast
 
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ImportMetadata,
@@ -32,7 +34,6 @@ from nucliadb.export_import.utils import (
     TaskRetryHandler,
     import_binary,
     restore_broker_message,
-    set_entities_groups,
     set_labels,
 )
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -46,7 +47,7 @@ async def import_kb(
     context: ApplicationContext,
     kbid: str,
     stream: AsyncGenerator[bytes, None],
-    metadata: Optional[ImportMetadata] = None,
+    metadata: ImportMetadata | None = None,
 ) -> None:
     """
     Imports exported data from a stream into a knowledgebox.
@@ -72,8 +73,8 @@ async def import_kb(
             await import_binary(context, kbid, cf, binary_generator)
 
         elif item_type == ExportedItemType.ENTITIES:
-            entities = cast(kb_pb2.EntitiesGroups, data)
-            await set_entities_groups(context, kbid, entities)
+            # This is not supported anymore, we ignore it if we find it in and old backup
+            pass
 
         elif item_type == ExportedItemType.LABELS:
             labels = cast(kb_pb2.Labels, data)
@@ -99,7 +100,13 @@ async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTask
     """
     kbid, import_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    try:
+        metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Import metadata not found. Skipping import.", extra={"kbid": kbid, "import_id": import_id}
+        )
+        return
 
     retry_handler = TaskRetryHandler("import", dm, metadata)
 
nucliadb/export_import/tasks.py
CHANGED
@@ -56,6 +56,7 @@ def get_exports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=export_kb_to_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )
 
 
@@ -77,6 +78,7 @@ def get_imports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=import_kb_from_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )
 
 