nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/brain_v2.py
CHANGED
@@ -18,9 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import logging
+from collections.abc import Iterator
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Optional

 from nidx_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
 from nidx_protos.noderesources_pb2 import (
@@ -81,9 +81,9 @@ class ResourceBrain:
         self,
         basic: Basic,
         user_relations: Relations,
-        origin:
-        previous_processing_status:
-        security:
+        origin: Origin | None,
+        previous_processing_status: Metadata.Status.ValueType | None,
+        security: utils_pb2.Security | None,
     ) -> None:
         self._set_resource_status(basic, previous_processing_status)
         self._set_resource_dates(basic, origin)
@@ -97,9 +97,9 @@ class ResourceBrain:
         self,
         field_key: str,
         extracted_text: ExtractedText,
-        field_computed_metadata:
-        basic_user_metadata:
-        field_author:
+        field_computed_metadata: FieldComputedMetadata | None,
+        basic_user_metadata: UserMetadata | None,
+        field_author: FieldAuthor | None,
         replace_field: bool,
         skip_index: bool,
     ) -> None:
@@ -122,13 +122,17 @@ class ResourceBrain:
         field_key: str,
         extracted_text: ExtractedText,
         replace_field: bool,
-        skip_texts:
+        skip_texts: bool | None,
     ):
         if skip_texts is not None:
             self.brain.skip_texts = skip_texts
+
         field_text = extracted_text.text
-
-
+
+        for split_id in self.sorted_splits(extracted_text):
+            split_text = extracted_text.split_text[split_id]
+            field_text += f"{split_text} "
+
         self.brain.texts[field_key].text = field_text

         if replace_field:
@@ -140,18 +144,16 @@ class ResourceBrain:
     def apply_field_labels(
         self,
         field_key: str,
-        field_computed_metadata:
-        field_author:
-        basic_user_metadata:
+        field_computed_metadata: FieldComputedMetadata | None,
+        field_author: FieldAuthor | None,
+        basic_user_metadata: UserMetadata | None = None,
     ):
         user_cancelled_labels: set[str] = (
-
-
-
-
-
-            ]
-            )
+            {
+                f"{classification.labelset}/{classification.label}"
+                for classification in basic_user_metadata.classifications
+                if classification.cancelled_by_user
+            }
             if basic_user_metadata
             else set()
         )
@@ -212,13 +214,17 @@ class ResourceBrain:
         field_key: str,
         field_computed_metadata: FieldComputedMetadata,
         extracted_text: ExtractedText,
-        page_positions:
-        user_field_metadata:
+        page_positions: FilePagePositions | None,
+        user_field_metadata: UserFieldMetadata | None,
         replace_field: bool,
-        skip_paragraphs_index:
-        skip_texts_index:
-        append_splits:
+        skip_paragraphs_index: bool | None,
+        skip_texts_index: bool | None,
+        append_splits: set[str] | None = None,
     ) -> None:
+        """
+        append_splits: when provided, only the splits in this set will be indexed. This is used for conversation appends, to
+        avoid reindexing all previous messages of the conversation.
+        """
         # We need to add the extracted text to the texts section of the Resource so that
         # the paragraphs can be indexed
         self.apply_field_text(
@@ -238,30 +244,41 @@ class ResourceBrain:
             append_splits=append_splits,
         )

+    def sorted_splits(self, extracted_text: ExtractedText) -> Iterator[str]:
+        yield from sorted(extracted_text.split_text.keys())
+
     @observer.wrap({"type": "apply_field_paragraphs"})
     def apply_field_paragraphs(
         self,
         field_key: str,
         field_computed_metadata: FieldComputedMetadata,
         extracted_text: ExtractedText,
-        page_positions:
-        user_field_metadata:
+        page_positions: FilePagePositions | None,
+        user_field_metadata: UserFieldMetadata | None,
         replace_field: bool,
-        skip_paragraphs:
-        append_splits:
+        skip_paragraphs: bool | None,
+        append_splits: set[str] | None = None,
     ) -> None:
         if skip_paragraphs is not None:
             self.brain.skip_paragraphs = skip_paragraphs
         unique_paragraphs: set[str] = set()
         user_paragraph_classifications = self._get_paragraph_user_classifications(user_field_metadata)
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
+
         # Splits of the field
-
-
-
-
-
+
+        # Used to adjust the paragraph start/end when indexing splits, as they are all
+        # concatenated in the main text part of the brain Resource.
+        split_offset = 0
+        for subfield in self.sorted_splits(extracted_text):
+            if subfield not in field_computed_metadata.split_metadata or should_skip_split_indexing(
+                subfield, replace_field, append_splits
+            ):
+                # We're skipping this split but we need to adjust the offset as we have added the text
+                # of this split to the main text
+                split_offset += len(extracted_text.split_text[subfield]) + 1  # +1 for the space
                 continue
+            field_metadata = field_computed_metadata.split_metadata[subfield]
             extracted_text_str = extracted_text.split_text[subfield]
             for idx, paragraph in enumerate(field_metadata.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
@@ -288,8 +305,8 @@ class ResourceBrain:
                     representation.file = paragraph.representation.reference_file
                     representation.is_a_table = paragraph.representation.is_a_table
                 p = BrainParagraph(
-                    start=paragraph.start,
-                    end=paragraph.end,
+                    start=paragraph.start + split_offset,
+                    end=paragraph.end + split_offset,
                     field=field_key,
                     split=subfield,
                     index=idx,
@@ -304,6 +321,7 @@ class ResourceBrain:
                         representation=representation,
                     ),
                 )
+                split_offset = p.end + 1  # +1 for the space
                 paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
                 paragraph_labels = {paragraph_kind_label}
                 paragraph_labels.update(
@@ -372,7 +390,7 @@ class ResourceBrain:
             self.brain.paragraphs_to_delete.append(full_field_id)

     def _get_paragraph_user_classifications(
-        self, basic_user_field_metadata:
+        self, basic_user_field_metadata: UserFieldMetadata | None
     ) -> ParagraphClassifications:
         pc = ParagraphClassifications(valid={}, denied={})
         if basic_user_field_metadata is None:
@@ -391,18 +409,16 @@ class ResourceBrain:
     def generate_relations(
         self,
         field_key: str,
-        field_computed_metadata:
-        basic_user_metadata:
+        field_computed_metadata: FieldComputedMetadata | None,
+        basic_user_metadata: UserMetadata | None,
         replace_field: bool,
     ) -> None:
         user_cancelled_labels: set[str] = (
-
-
-
-
-
-            ]
-            )
+            {
+                f"{classification.labelset}/{classification.label}"
+                for classification in basic_user_metadata.classifications
+                if classification.cancelled_by_user
+            }
             if basic_user_metadata
             else set()
         )
@@ -491,7 +507,7 @@ class ResourceBrain:
         full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
         self.brain.texts_to_delete.append(full_field_id)
         self.brain.paragraphs_to_delete.append(full_field_id)
-        self.brain.
+        self.brain.vectors_to_delete_in_all_vectorsets.append(full_field_id)
         self.brain.relation_fields_to_delete.append(field_key)

     @observer.wrap({"type": "generate_vectors"})
@@ -503,8 +519,8 @@ class ResourceBrain:
         vectorset: str,
         replace_field: bool = False,
         # cut to specific dimension if specified
-        vector_dimension:
-        append_splits:
+        vector_dimension: int | None = None,
+        append_splits: set[str] | None = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
@@ -578,7 +594,7 @@ class ResourceBrain:
         *,
         vectorset: str,
         # cut vectors if a specific dimension is specified
-        vector_dimension:
+        vector_dimension: int | None = None,
     ):
         paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
         sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
@@ -603,7 +619,7 @@ class ResourceBrain:

         sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index

-    def _set_resource_status(self, basic: Basic, previous_status:
+    def _set_resource_status(self, basic: Basic, previous_status: Metadata.Status.ValueType | None):
         """
         We purposefully overwrite what we index as a status and DO NOT reflect
         actual status with what we index.
@@ -633,32 +649,32 @@ class ResourceBrain:
            return "EMPTY"
        return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]

-    def _set_resource_dates(self, basic: Basic, origin:
+    def _set_resource_dates(self, basic: Basic, origin: Origin | None):
        """
        Adds the user-defined dates to the brain object. This is at resource level and applies to
        all fields of the resource.
        """
-        if basic.created.seconds
+        if basic.created.seconds != 0:
            self.brain.metadata.created.CopyFrom(basic.created)
        else:
            logging.warning(f"Basic metadata has no created field for {self.rid}")
            self.brain.metadata.created.GetCurrentTime()
-        if basic.modified.seconds
+        if basic.modified.seconds != 0:
            self.brain.metadata.modified.CopyFrom(basic.modified)
        else:
-            if basic.created.seconds
+            if basic.created.seconds != 0:
                self.brain.metadata.modified.CopyFrom(basic.created)
            else:
                self.brain.metadata.modified.GetCurrentTime()

        if origin is not None:
            # overwrite created/modified if provided on origin
-            if origin.HasField("created") and origin.created.seconds
+            if origin.HasField("created") and origin.created.seconds != 0:
                self.brain.metadata.created.CopyFrom(origin.created)
-            if origin.HasField("modified") and origin.modified.seconds
+            if origin.HasField("modified") and origin.modified.seconds != 0:
                self.brain.metadata.modified.CopyFrom(origin.modified)

-    def _set_resource_relations(self, basic: Basic, origin:
+    def _set_resource_relations(self, basic: Basic, origin: Origin | None, user_relations: Relations):
        """
        Adds the relations to the brain object corresponding to the user-defined metadata at the resource level:
        - Contributors of the document
@@ -702,7 +718,7 @@ class ResourceBrain:

        self.brain.relation_fields_to_delete.append("a/metadata")

-    def _set_resource_labels(self, basic: Basic, origin:
+    def _set_resource_labels(self, basic: Basic, origin: Origin | None):
        """
        Adds the resource-level labels to the brain object.
        These levels are user-defined in basic or origin metadata.
@@ -759,7 +775,7 @@

 def is_paragraph_repeated_in_field(
     paragraph: Paragraph,
-    extracted_text:
+    extracted_text: str | None,
     unique_paragraphs: set[str],
 ) -> bool:
     if extracted_text is None:
@@ -798,15 +814,13 @@ class ParagraphPages:
            return self._materialized[paragraph_start_index]
        except IndexError:
            logger.error(
-                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"
+                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"
            )
            if len(self._materialized) > 0:
                return self._materialized[-1]
            return 0


-def should_skip_split_indexing(
-    split: str, replace_field: bool, append_splits: Optional[set[str]]
-) -> bool:
+def should_skip_split_indexing(split: str, replace_field: bool, append_splits: set[str] | None) -> bool:
    # When replacing the whole field, reindex all splits. Otherwise, we're only indexing the splits that are appended
    return not replace_field and append_splits is not None and split not in append_splits
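Note: the split-offset bookkeeping added above can be illustrated with a small, self-contained sketch. This is not nucliadb code; the function and variable names below are hypothetical, and it only models the general idea the diff introduces: split texts are concatenated into one field text with a single space separator, and each split's local paragraph positions are shifted by the length of everything concatenated before it, similar to what the new `split_offset` handling in `apply_field_paragraphs` does.

```python
# Hedged sketch (not nucliadb code): shift per-split paragraph positions into a
# single concatenated text, using a space as the separator between splits.

def index_split_paragraphs(
    splits: dict[str, str],
    paragraphs: dict[str, list[tuple[int, int]]],
) -> list[tuple[str, int, int]]:
    """splits: split_id -> split text; paragraphs: split_id -> [(start, end), ...] local to that split."""
    global_positions = []
    offset = 0
    for split_id in sorted(splits.keys()):  # splits are visited in sorted order
        text = splits[split_id]
        for start, end in paragraphs.get(split_id, []):
            # Shift local positions by the length of all previously concatenated splits.
            global_positions.append((split_id, start + offset, end + offset))
        offset += len(text) + 1  # +1 for the space separating this split from the next
    return global_positions


if __name__ == "__main__":
    splits = {"msg1": "Hello there", "msg2": "How are you?"}
    paragraphs = {"msg1": [(0, 11)], "msg2": [(0, 12)]}
    # msg2's paragraph lands at 12..24 in the concatenated "Hello there How are you?"
    print(index_split_paragraphs(splits, paragraphs))
```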

nucliadb/ingest/orm/broker_message.py
CHANGED
@@ -56,7 +56,7 @@ class _BrokerMessageBuilder:
         # clear the state and generate a new broker message
         self.bm.Clear()

-        self.bm.kbid = resource.
+        self.bm.kbid = resource.kbid
         self.bm.uuid = resource.uuid
         basic = await resource.get_basic()
         if basic is not None:
@@ -93,9 +93,7 @@ class _BrokerMessageBuilder:
             self.bm.link_extracted_data.append(link_extracted_data)

         # Field vectors
-        async for vectorset_id, vs in datamanagers.vectorsets.iter(
-            resource.txn, kbid=resource.kb.kbid
-        ):
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(resource.txn, kbid=resource.kbid):
             await self.generate_field_vectors(
                 type_id, field_id, field, vectorset_id, vs.storage_key_kind
             )
nucliadb/ingest/orm/entities.py
CHANGED
@@ -18,8 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #

-import
-from typing import AsyncGenerator, Optional
+from collections.abc import AsyncGenerator

 from nidx_protos.nodereader_pb2 import (
     Faceted,
@@ -29,23 +28,12 @@ from nidx_protos.nodereader_pb2 import (
     SearchResponse,
 )

-from nucliadb.common import datamanagers
-from nucliadb.common.cluster.exceptions import (
-    AlreadyExists,
-    EntitiesGroupNotFound,
-)
 from nucliadb.common.cluster.utils import get_shard_manager
-from nucliadb.common.datamanagers.entities import (
-    KB_DELETED_ENTITIES_GROUPS,
-    KB_ENTITIES,
-    KB_ENTITIES_GROUP,
-)
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.settings import settings
 from nucliadb.search.search.shards import graph_search_shard, query_shard
 from nucliadb_protos.knowledgebox_pb2 import (
-    DeletedEntitiesGroups,
     EntitiesGroup,
     EntitiesGroupSummary,
     Entity,
@@ -53,8 +41,6 @@ from nucliadb_protos.knowledgebox_pb2 import (
 from nucliadb_protos.utils_pb2 import RelationNode
 from nucliadb_protos.writer_pb2 import GetEntitiesResponse

-from .exceptions import EntityManagementException
-
 MAX_DUPLICATES = 300
 MAX_DELETED = 300

@@ -69,20 +55,11 @@ class EntitiesManager:
         self.txn = txn
         self.kbid = self.kb.kbid

-    async def create_entities_group(self, group: str, entities: EntitiesGroup):
-        if await self.entities_group_exists(group):
-            raise AlreadyExists(f"Entities group {group} already exists")
-
-        await self.store_entities_group(group, entities)
-
     async def get_entities(self, entities: GetEntitiesResponse):
         async for group, eg in self.iterate_entities_groups(exclude_deleted=True):
             entities.groups[group].CopyFrom(eg)

-    async def get_entities_group(self, group: str) ->
-        deleted = await self.is_entities_group_deleted(group)
-        if deleted:
-            return None
+    async def get_entities_group(self, group: str) -> EntitiesGroup | None:
         return await self.get_entities_group_inner(group)

     async def get_entities_groups(self) -> dict[str, EntitiesGroup]:
@@ -93,113 +70,18 @@ class EntitiesManager:

     async def list_entities_groups(self) -> dict[str, EntitiesGroupSummary]:
         groups = {}
-        max_simultaneous = asyncio.Semaphore(10)

-        async
-
-            stored = await self.get_stored_entities_group(group)
-            if stored is not None:
-                groups[group] = EntitiesGroupSummary(
-                    title=stored.title, color=stored.color, custom=stored.custom
-                )
-            else:
-                # We don't want to search for each indexed group, as we are
-                # providing a quick summary
-                groups[group] = EntitiesGroupSummary()
+        async for group in self.iterate_entities_groups_names(exclude_deleted=True):
+            groups[group] = EntitiesGroupSummary()

-        tasks = [
-            asyncio.create_task(_composition(group))
-            async for group in self.iterate_entities_groups_names(exclude_deleted=True)
-        ]
-        if tasks:
-            await asyncio.wait(tasks)
         return groups

-    async def update_entities(self, group: str, entities: dict[str, Entity]):
-        """Update entities on an entity group. New entities are appended and existing
-        are overwriten. Existing entities not appearing in `entities` are left
-        intact. Use `delete_entities` to delete them instead.
-
-        """
-        if not await self.entities_group_exists(group):
-            raise EntitiesGroupNotFound(f"Entities group '{group}' doesn't exist")
-
-        entities_group = await self.get_stored_entities_group(group)
-        if entities_group is None:
-            entities_group = EntitiesGroup()
-
-        for name, entity in entities.items():
-            entities_group.entities[name].CopyFrom(entity)
-
-        await self.store_entities_group(group, entities_group)
-
-    async def set_entities_group(self, group: str, entities: EntitiesGroup):
-        indexed = await self.get_indexed_entities_group(group)
-        if indexed is None:
-            updated = entities
-        else:
-            updated = EntitiesGroup()
-            updated.CopyFrom(entities)
-
-            for name, entity in indexed.entities.items():
-                if name not in updated.entities:
-                    updated.entities[name].CopyFrom(entity)
-                    updated.entities[name].deleted = True
-
-        await self.store_entities_group(group, updated)
-
-    async def set_entities_group_force(self, group: str, entitiesgroup: EntitiesGroup):
-        await self.store_entities_group(group, entitiesgroup)
-
-    async def set_entities_group_metadata(
-        self, group: str, *, title: Optional[str] = None, color: Optional[str] = None
-    ):
-        entities_group = await self.get_stored_entities_group(group)
-        if entities_group is None:
-            entities_group = EntitiesGroup()
-
-        if title:
-            entities_group.title = title
-        if color:
-            entities_group.color = color
-
-        await self.store_entities_group(group, entities_group)
-
-    async def delete_entities(self, group: str, delete: list[str]):
-        stored = await self.get_stored_entities_group(group)
-
-        stored = stored or EntitiesGroup()
-        for name in delete:
-            if name not in stored.entities:
-                entity = stored.entities[name]
-                entity.value = name
-            else:
-                entity = stored.entities[name]
-                entity.deleted = True
-        await self.store_entities_group(group, stored)
-
-    async def delete_entities_group(self, group: str):
-        await self.delete_stored_entities_group(group)
-        await self.mark_entities_group_as_deleted(group)
-
     # Private API

-    async def get_entities_group_inner(self, group: str) ->
-
-        indexed = await self.get_indexed_entities_group(group)
-        if stored is None and indexed is None:
-            # Entity group does not exist
-            return None
-        elif stored is not None and indexed is not None:
-            entities_group = self.merge_entities_groups(indexed, stored)
-        else:
-            entities_group = stored or indexed
-        return entities_group
-
-    async def get_stored_entities_group(self, group: str) -> Optional[EntitiesGroup]:
-        return await datamanagers.entities.get_entities_group(self.txn, kbid=self.kbid, group=group)
+    async def get_entities_group_inner(self, group: str) -> EntitiesGroup | None:
+        return await self.get_indexed_entities_group(group)

-    async def get_indexed_entities_group(self, group: str) ->
+    async def get_indexed_entities_group(self, group: str) -> EntitiesGroup | None:
         shard_manager = get_shard_manager()

         async def do_entities_search(shard_id: str) -> GraphSearchResponse:
@@ -228,26 +110,9 @@ class EntitiesManager:
         eg = EntitiesGroup(entities=entities)
         return eg

-    async def get_deleted_entities_groups(self) -> set[str]:
-        deleted: set[str] = set()
-        key = KB_DELETED_ENTITIES_GROUPS.format(kbid=self.kbid)
-        payload = await self.txn.get(key)
-        if payload:
-            deg = DeletedEntitiesGroups()
-            deg.ParseFromString(payload)
-            deleted.update(deg.entities_groups)
-        return deleted
-
     async def entities_group_exists(self, group: str) -> bool:
-        stored = await self.get_stored_entities_group(group)
-        if stored is not None:
-            return True
-
         indexed = await self.get_indexed_entities_group(group)
-
-            return True
-
-        return False
+        return indexed is not None

     async def iterate_entities_groups(
         self, exclude_deleted: bool
@@ -262,27 +127,10 @@ class EntitiesManager:
         self,
         exclude_deleted: bool,
     ) -> AsyncGenerator[str, None]:
-        # Start the task to get indexed groups
-        indexed_task = asyncio.create_task(self.get_indexed_entities_groups_names())
-
-        if exclude_deleted:
-            deleted_groups = await self.get_deleted_entities_groups()
-
         visited_groups = set()
-
-        # stored groups
-        entities_key = KB_ENTITIES.format(kbid=self.kbid)
-        async for key in self.txn.keys(entities_key):
-            group = key.split("/")[-1]
-            if exclude_deleted and group in deleted_groups:
-                continue
-            yield group
-            visited_groups.add(group)
-
-        # indexed groups
-        indexed_groups = await indexed_task
+        indexed_groups = await self.get_indexed_entities_groups_names()
         for group in indexed_groups:
-            if
+            if group in visited_groups:
                 continue
             yield group
             visited_groups.add(group)
@@ -319,53 +167,6 @@ class EntitiesManager:
             return set()
         return set.union(*results)

-    async def store_entities_group(self, group: str, eg: EntitiesGroup):
-        meta_cache = await datamanagers.entities.get_entities_meta_cache(self.txn, kbid=self.kbid)
-        duplicates = {}
-        deleted = []
-        duplicate_count = 0
-        for entity in eg.entities.values():
-            if entity.deleted:
-                deleted.append(entity.value)
-                continue
-            if len(entity.represents) == 0:
-                continue
-            duplicates[entity.value] = list(entity.represents)
-            duplicate_count += len(duplicates[entity.value])
-
-        if duplicate_count > MAX_DUPLICATES:
-            raise EntityManagementException(
-                f"Too many duplicates: {duplicate_count}. Max of {MAX_DUPLICATES} currently allowed"
-            )
-        if len(deleted) > MAX_DELETED:
-            raise EntityManagementException(
-                f"Too many deleted entities: {len(deleted)}. Max of {MAX_DELETED} currently allowed"
-            )
-
-        meta_cache.set_duplicates(group, duplicates)
-        meta_cache.set_deleted(group, deleted)
-        await datamanagers.entities.set_entities_meta_cache(self.txn, kbid=self.kbid, cache=meta_cache)
-
-        await datamanagers.entities.set_entities_group(
-            self.txn, kbid=self.kbid, group_id=group, entities=eg
-        )
-        # if it was preivously deleted, we must unmark it
-        await self.unmark_entities_group_as_deleted(group)
-
-    async def is_entities_group_deleted(self, group: str):
-        deleted_groups = await self.get_deleted_entities_groups()
-        return group in deleted_groups
-
-    async def delete_stored_entities_group(self, group: str):
-        entities_key = KB_ENTITIES_GROUP.format(kbid=self.kbid, id=group)
-        await self.txn.delete(entities_key)
-
-    async def mark_entities_group_as_deleted(self, group: str):
-        await datamanagers.entities.mark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
-    async def unmark_entities_group_as_deleted(self, group: str):
-        await datamanagers.entities.unmark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
     @staticmethod
     def merge_entities_groups(indexed: EntitiesGroup, stored: EntitiesGroup):
         """Create a new EntitiesGroup with the merged entities from `stored` and

nucliadb/ingest/orm/index_message.py
CHANGED
@@ -20,7 +20,7 @@


 import asyncio
-from
+from collections.abc import Sequence

 from nidx_protos.noderesources_pb2 import Resource as IndexMessage

@@ -70,8 +70,8 @@ class IndexMessageBuilder:
         vectors: bool = True,
         relations: bool = True,
         replace: bool = True,
-        vectorset_configs:
-        append_splits:
+        vectorset_configs: list[VectorSetConfig] | None = None,
+        append_splits: set[str] | None = None,
     ):
         field = await self.resource.get_field(fieldid.field, fieldid.field_type)
         extracted_text = await field.get_extracted_text()
@@ -281,7 +281,7 @@ class IndexMessageBuilder:
         vectorset_configs = [
             vectorset_config
             async for _, vectorset_config in datamanagers.vectorsets.iter(
-                self.resource.txn, kbid=self.resource.
+                self.resource.txn, kbid=self.resource.kbid
             )
         ]
         return vectorset_configs