nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/entities.py
CHANGED
@@ -18,8 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #

-import asyncio
-from typing import AsyncGenerator, Optional
+from collections.abc import AsyncGenerator

 from nidx_protos.nodereader_pb2 import (
     Faceted,
@@ -29,23 +28,12 @@ from nidx_protos.nodereader_pb2 import (
     SearchResponse,
 )

-from nucliadb.common import datamanagers
-from nucliadb.common.cluster.exceptions import (
-    AlreadyExists,
-    EntitiesGroupNotFound,
-)
 from nucliadb.common.cluster.utils import get_shard_manager
-from nucliadb.common.datamanagers.entities import (
-    KB_DELETED_ENTITIES_GROUPS,
-    KB_ENTITIES,
-    KB_ENTITIES_GROUP,
-)
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.settings import settings
 from nucliadb.search.search.shards import graph_search_shard, query_shard
 from nucliadb_protos.knowledgebox_pb2 import (
-    DeletedEntitiesGroups,
     EntitiesGroup,
     EntitiesGroupSummary,
     Entity,
@@ -53,8 +41,6 @@ from nucliadb_protos.knowledgebox_pb2 import (
 from nucliadb_protos.utils_pb2 import RelationNode
 from nucliadb_protos.writer_pb2 import GetEntitiesResponse

-from .exceptions import EntityManagementException
-
 MAX_DUPLICATES = 300
 MAX_DELETED = 300

@@ -69,20 +55,11 @@ class EntitiesManager:
         self.txn = txn
         self.kbid = self.kb.kbid

-    async def create_entities_group(self, group: str, entities: EntitiesGroup):
-        if await self.entities_group_exists(group):
-            raise AlreadyExists(f"Entities group {group} already exists")
-
-        await self.store_entities_group(group, entities)
-
     async def get_entities(self, entities: GetEntitiesResponse):
         async for group, eg in self.iterate_entities_groups(exclude_deleted=True):
             entities.groups[group].CopyFrom(eg)

-    async def get_entities_group(self, group: str) -> Optional[EntitiesGroup]:
-        deleted = await self.is_entities_group_deleted(group)
-        if deleted:
-            return None
+    async def get_entities_group(self, group: str) -> EntitiesGroup | None:
         return await self.get_entities_group_inner(group)

     async def get_entities_groups(self) -> dict[str, EntitiesGroup]:
@@ -93,113 +70,18 @@ class EntitiesManager:

     async def list_entities_groups(self) -> dict[str, EntitiesGroupSummary]:
         groups = {}
-        max_simultaneous = asyncio.Semaphore(10)

-        async def _composition(group: str):
-            async with max_simultaneous:
-                stored = await self.get_stored_entities_group(group)
-                if stored is not None:
-                    groups[group] = EntitiesGroupSummary(
-                        title=stored.title, color=stored.color, custom=stored.custom
-                    )
-                else:
-                    # We don't want to search for each indexed group, as we are
-                    # providing a quick summary
-                    groups[group] = EntitiesGroupSummary()
+        async for group in self.iterate_entities_groups_names(exclude_deleted=True):
+            groups[group] = EntitiesGroupSummary()

-        tasks = [
-            asyncio.create_task(_composition(group))
-            async for group in self.iterate_entities_groups_names(exclude_deleted=True)
-        ]
-        if tasks:
-            await asyncio.wait(tasks)
         return groups

-    async def update_entities(self, group: str, entities: dict[str, Entity]):
-        """Update entities on an entity group. New entities are appended and existing
-        are overwriten. Existing entities not appearing in `entities` are left
-        intact. Use `delete_entities` to delete them instead.
-
-        """
-        if not await self.entities_group_exists(group):
-            raise EntitiesGroupNotFound(f"Entities group '{group}' doesn't exist")
-
-        entities_group = await self.get_stored_entities_group(group)
-        if entities_group is None:
-            entities_group = EntitiesGroup()
-
-        for name, entity in entities.items():
-            entities_group.entities[name].CopyFrom(entity)
-
-        await self.store_entities_group(group, entities_group)
-
-    async def set_entities_group(self, group: str, entities: EntitiesGroup):
-        indexed = await self.get_indexed_entities_group(group)
-        if indexed is None:
-            updated = entities
-        else:
-            updated = EntitiesGroup()
-            updated.CopyFrom(entities)
-
-            for name, entity in indexed.entities.items():
-                if name not in updated.entities:
-                    updated.entities[name].CopyFrom(entity)
-                    updated.entities[name].deleted = True
-
-        await self.store_entities_group(group, updated)
-
-    async def set_entities_group_force(self, group: str, entitiesgroup: EntitiesGroup):
-        await self.store_entities_group(group, entitiesgroup)
-
-    async def set_entities_group_metadata(
-        self, group: str, *, title: Optional[str] = None, color: Optional[str] = None
-    ):
-        entities_group = await self.get_stored_entities_group(group)
-        if entities_group is None:
-            entities_group = EntitiesGroup()
-
-        if title:
-            entities_group.title = title
-        if color:
-            entities_group.color = color
-
-        await self.store_entities_group(group, entities_group)
-
-    async def delete_entities(self, group: str, delete: list[str]):
-        stored = await self.get_stored_entities_group(group)
-
-        stored = stored or EntitiesGroup()
-        for name in delete:
-            if name not in stored.entities:
-                entity = stored.entities[name]
-                entity.value = name
-            else:
-                entity = stored.entities[name]
-                entity.deleted = True
-        await self.store_entities_group(group, stored)
-
-    async def delete_entities_group(self, group: str):
-        await self.delete_stored_entities_group(group)
-        await self.mark_entities_group_as_deleted(group)
-
     # Private API

-    async def get_entities_group_inner(self, group: str) -> Optional[EntitiesGroup]:
-        stored = await self.get_stored_entities_group(group)
-        indexed = await self.get_indexed_entities_group(group)
-        if stored is None and indexed is None:
-            # Entity group does not exist
-            return None
-        elif stored is not None and indexed is not None:
-            entities_group = self.merge_entities_groups(indexed, stored)
-        else:
-            entities_group = stored or indexed
-        return entities_group
-
-    async def get_stored_entities_group(self, group: str) -> Optional[EntitiesGroup]:
-        return await datamanagers.entities.get_entities_group(self.txn, kbid=self.kbid, group=group)
+    async def get_entities_group_inner(self, group: str) -> EntitiesGroup | None:
+        return await self.get_indexed_entities_group(group)

-    async def get_indexed_entities_group(self, group: str) -> Optional[EntitiesGroup]:
+    async def get_indexed_entities_group(self, group: str) -> EntitiesGroup | None:
         shard_manager = get_shard_manager()

         async def do_entities_search(shard_id: str) -> GraphSearchResponse:
@@ -228,26 +110,9 @@ class EntitiesManager:
         eg = EntitiesGroup(entities=entities)
         return eg

-    async def get_deleted_entities_groups(self) -> set[str]:
-        deleted: set[str] = set()
-        key = KB_DELETED_ENTITIES_GROUPS.format(kbid=self.kbid)
-        payload = await self.txn.get(key)
-        if payload:
-            deg = DeletedEntitiesGroups()
-            deg.ParseFromString(payload)
-            deleted.update(deg.entities_groups)
-        return deleted
-
     async def entities_group_exists(self, group: str) -> bool:
-        stored = await self.get_stored_entities_group(group)
-        if stored is not None:
-            return True
-
         indexed = await self.get_indexed_entities_group(group)
-        if indexed is not None:
-            return True
-
-        return False
+        return indexed is not None

     async def iterate_entities_groups(
         self, exclude_deleted: bool
@@ -262,27 +127,10 @@ class EntitiesManager:
         self,
         exclude_deleted: bool,
     ) -> AsyncGenerator[str, None]:
-        # Start the task to get indexed groups
-        indexed_task = asyncio.create_task(self.get_indexed_entities_groups_names())
-
-        if exclude_deleted:
-            deleted_groups = await self.get_deleted_entities_groups()
-
         visited_groups = set()
-
-        # stored groups
-        entities_key = KB_ENTITIES.format(kbid=self.kbid)
-        async for key in self.txn.keys(entities_key):
-            group = key.split("/")[-1]
-            if exclude_deleted and group in deleted_groups:
-                continue
-            yield group
-            visited_groups.add(group)
-
-        # indexed groups
-        indexed_groups = await indexed_task
+        indexed_groups = await self.get_indexed_entities_groups_names()
         for group in indexed_groups:
-            if
+            if group in visited_groups:
                 continue
             yield group
             visited_groups.add(group)
@@ -319,53 +167,6 @@ class EntitiesManager:
             return set()
         return set.union(*results)

-    async def store_entities_group(self, group: str, eg: EntitiesGroup):
-        meta_cache = await datamanagers.entities.get_entities_meta_cache(self.txn, kbid=self.kbid)
-        duplicates = {}
-        deleted = []
-        duplicate_count = 0
-        for entity in eg.entities.values():
-            if entity.deleted:
-                deleted.append(entity.value)
-                continue
-            if len(entity.represents) == 0:
-                continue
-            duplicates[entity.value] = list(entity.represents)
-            duplicate_count += len(duplicates[entity.value])
-
-        if duplicate_count > MAX_DUPLICATES:
-            raise EntityManagementException(
-                f"Too many duplicates: {duplicate_count}. Max of {MAX_DUPLICATES} currently allowed"
-            )
-        if len(deleted) > MAX_DELETED:
-            raise EntityManagementException(
-                f"Too many deleted entities: {len(deleted)}. Max of {MAX_DELETED} currently allowed"
-            )
-
-        meta_cache.set_duplicates(group, duplicates)
-        meta_cache.set_deleted(group, deleted)
-        await datamanagers.entities.set_entities_meta_cache(self.txn, kbid=self.kbid, cache=meta_cache)
-
-        await datamanagers.entities.set_entities_group(
-            self.txn, kbid=self.kbid, group_id=group, entities=eg
-        )
-        # if it was preivously deleted, we must unmark it
-        await self.unmark_entities_group_as_deleted(group)
-
-    async def is_entities_group_deleted(self, group: str):
-        deleted_groups = await self.get_deleted_entities_groups()
-        return group in deleted_groups
-
-    async def delete_stored_entities_group(self, group: str):
-        entities_key = KB_ENTITIES_GROUP.format(kbid=self.kbid, id=group)
-        await self.txn.delete(entities_key)
-
-    async def mark_entities_group_as_deleted(self, group: str):
-        await datamanagers.entities.mark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
-    async def unmark_entities_group_as_deleted(self, group: str):
-        await datamanagers.entities.unmark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
     @staticmethod
     def merge_entities_groups(indexed: EntitiesGroup, stored: EntitiesGroup):
         """Create a new EntitiesGroup with the merged entities from `stored` and
nucliadb/ingest/orm/index_message.py
CHANGED
@@ -20,11 +20,12 @@


 import asyncio
-from
+from collections.abc import Sequence

 from nidx_protos.noderesources_pb2 import Resource as IndexMessage

 from nucliadb.common import datamanagers
+from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.orm.brain_v2 import ResourceBrain
@@ -32,6 +33,7 @@ from nucliadb.ingest.orm.metrics import index_message_observer as observer
 from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
+from nucliadb_protos.utils_pb2 import ExtractedText
 from nucliadb_protos.writer_pb2 import BrokerMessage


@@ -68,7 +70,8 @@ class IndexMessageBuilder:
         vectors: bool = True,
         relations: bool = True,
         replace: bool = True,
-        vectorset_configs:
+        vectorset_configs: list[VectorSetConfig] | None = None,
+        append_splits: set[str] | None = None,
     ):
         field = await self.resource.get_field(fieldid.field, fieldid.field_type)
         extracted_text = await field.get_extracted_text()
@@ -120,6 +123,7 @@ class IndexMessageBuilder:
                 replace_field=replace_paragraphs,
                 skip_paragraphs_index=skip_paragraphs_index,
                 skip_texts_index=skip_texts_index,
+                append_splits=append_splits,
             )
         if vectors:
             assert vectorset_configs is not None
@@ -137,6 +141,7 @@ class IndexMessageBuilder:
                     vectorset=vectorset_config.vectorset_id,
                     replace_field=replace,
                     vector_dimension=dimension,
+                    append_splits=append_splits,
                 )
         if relations:
             await asyncio.to_thread(
@@ -150,7 +155,7 @@ class IndexMessageBuilder:
     def _apply_field_deletions(
         self,
         brain: ResourceBrain,
-        field_ids:
+        field_ids: Sequence[FieldID],
     ) -> None:
         for field_id in field_ids:
             brain.delete_field(self.resource.generate_field_id(field_id))
@@ -158,20 +163,19 @@ class IndexMessageBuilder:
     @observer.wrap({"type": "writer_bm"})
     async def for_writer_bm(
         self,
-        messages: list[BrokerMessage],
+        message: BrokerMessage,
         resource_created: bool,
     ) -> IndexMessage:
         """
-        Builds the index message for the broker messages coming from the writer.
+        Builds the index message for the broker message coming from the writer.
         The writer messages are not adding new vectors to the index.
         """
-        assert
+        assert message.source == BrokerMessage.MessageSource.WRITER

-
-        self._apply_field_deletions(self.brain, deleted_fields)
+        self._apply_field_deletions(self.brain, message.delete_fields)
         await self._apply_resource_index_data(self.brain)
         basic = await self.get_basic()
-        prefilter_update = needs_prefilter_update(messages)
+        prefilter_update = needs_prefilter_update(message)
         if prefilter_update:
             # Changes on some metadata at the resource level that is used for filtering require that we reindex all the fields
             # in the texts index (as it is the one used for prefiltering).
@@ -181,16 +185,16 @@ class IndexMessageBuilder:
             ]
         else:
             # Simply process the fields that are in the message
-            fields_to_index = get_bm_modified_fields(messages)
+            fields_to_index = get_bm_modified_fields(message)
         for fieldid in fields_to_index:
-            if fieldid in deleted_fields:
+            if fieldid in message.delete_fields:
                 continue
             await self._apply_field_index_data(
                 self.brain,
                 fieldid,
                 basic,
-                texts=prefilter_update or needs_texts_update(fieldid, messages),
-                paragraphs=needs_paragraphs_update(fieldid, messages),
+                texts=prefilter_update or needs_texts_update(fieldid, message),
+                paragraphs=needs_paragraphs_update(fieldid, message),
                 relations=False,  # Relations at the field level are not modified by the writer
                 vectors=False,  # Vectors are never added by the writer
                 replace=not resource_created,
@@ -200,32 +204,45 @@ class IndexMessageBuilder:
     @observer.wrap({"type": "processor_bm"})
     async def for_processor_bm(
         self,
-        messages: list[BrokerMessage],
+        message: BrokerMessage,
     ) -> IndexMessage:
         """
         Builds the index message for the broker messages coming from the processor.
         The processor can index new data to any index.
         """
-        assert
-
-        self._apply_field_deletions(self.brain, deleted_fields)
+        assert message.source == BrokerMessage.MessageSource.PROCESSOR
+        self._apply_field_deletions(self.brain, message.delete_fields)
         await self._apply_resource_index_data(self.brain)
         basic = await self.get_basic()
-        fields_to_index = get_bm_modified_fields(messages)
+        fields_to_index = get_bm_modified_fields(message)
         vectorsets_configs = await self.get_vectorsets_configs()
         for fieldid in fields_to_index:
-            if fieldid in deleted_fields:
+            if fieldid in message.delete_fields:
                 continue
+
+            # For conversation fields, we only replace the full field if it is not an append messages operation.
+            # All other fields are always replaced upon modification.
+            replace_field = True
+            modified_splits = None
+            if fieldid.field_type == FieldType.CONVERSATION:
+                modified_splits = await get_bm_modified_split_ids(fieldid, message, self.resource)
+                stored_splits = await get_stored_split_ids(fieldid, self.resource)
+                is_append_messages_op = modified_splits.issubset(stored_splits) and 0 < len(
+                    modified_splits
+                ) < len(stored_splits)
+                replace_field = not is_append_messages_op
+
             await self._apply_field_index_data(
                 self.brain,
                 fieldid,
                 basic,
-                texts=needs_texts_update(fieldid, messages),
-                paragraphs=needs_paragraphs_update(fieldid, messages),
-                relations=needs_relations_update(fieldid, messages),
-                vectors=needs_vectors_update(fieldid, messages),
-                replace=
+                texts=needs_texts_update(fieldid, message),
+                paragraphs=needs_paragraphs_update(fieldid, message),
+                relations=needs_relations_update(fieldid, message),
+                vectors=needs_vectors_update(fieldid, message),
+                replace=replace_field,
                 vectorset_configs=vectorsets_configs,
+                append_splits=modified_splits,
             )
         return self.brain.brain

@@ -264,136 +281,134 @@ class IndexMessageBuilder:
         vectorset_configs = [
             vectorset_config
             async for _, vectorset_config in datamanagers.vectorsets.iter(
-                self.resource.txn, kbid=self.resource.
+                self.resource.txn, kbid=self.resource.kbid
             )
         ]
         return vectorset_configs


-def
-    messages: list[BrokerMessage],
-) -> list[FieldID]:
-    deleted = []
-    for message in messages:
-        for field in message.delete_fields:
-            if field not in deleted:
-                deleted.append(field)
-    return deleted
-
-
-def get_bm_modified_fields(messages: list[BrokerMessage]) -> list[FieldID]:
-    message_source = get_messages_source(messages)
+def get_bm_modified_fields(message: BrokerMessage) -> list[FieldID]:
     modified = set()
-    for message in messages:
-        # Added or modified fields need indexing
-        for link in message.links:
-            modified.add((link, FieldType.LINK))
-        for file in message.files:
-            modified.add((file, FieldType.FILE))
-        for conv in message.conversations:
-            modified.add((conv, FieldType.CONVERSATION))
-        for text in message.texts:
-            modified.add((text, FieldType.TEXT))
+    # Added or modified fields need indexing
+    for link in message.links:
+        modified.add((link, FieldType.LINK))
+    for file in message.files:
+        modified.add((file, FieldType.FILE))
+    for conv in message.conversations:
+        modified.add((conv, FieldType.CONVERSATION))
+    for text in message.texts:
+        modified.add((text, FieldType.TEXT))
+    if message.HasField("basic"):
+        # Add title and summary only if they have changed
+        if message.basic.title != "":
+            modified.add(("title", FieldType.GENERIC))
+        if message.basic.summary != "":
+            modified.add(("summary", FieldType.GENERIC))
+
+    if message.source == BrokerMessage.MessageSource.PROCESSOR:
+        # Messages with field metadata, extracted text or field vectors need indexing
+        for fm in message.field_metadata:
+            modified.add((fm.field.field, fm.field.field_type))
+        for et in message.extracted_text:
+            modified.add((et.field.field, et.field.field_type))
+        for fv in message.field_vectors:
+            modified.add((fv.field.field, fv.field.field_type))
+
+    if message.source == BrokerMessage.MessageSource.WRITER:
+        # Any field that has fieldmetadata annotations should be considered as modified
+        # and needs to be reindexed
         if message.HasField("basic"):
-            # Add title and summary only if they have changed
-            if message.basic.title != "":
-                modified.add(("title", FieldType.GENERIC))
-            if message.basic.summary != "":
-                modified.add(("summary", FieldType.GENERIC))
-
-        if message_source == BrokerMessage.MessageSource.PROCESSOR:
-            # Messages with field metadata, extracted text or field vectors need indexing
-            for fm in message.field_metadata:
-                modified.add((fm.field.field, fm.field.field_type))
-            for et in message.extracted_text:
-                modified.add((et.field.field, et.field.field_type))
-            for fv in message.field_vectors:
-                modified.add((fv.field.field, fv.field.field_type))
-
-        if message_source == BrokerMessage.MessageSource.WRITER:
-            # Any field that has fieldmetadata annotations should be considered as modified
-            # and needs to be reindexed
-            if message.HasField("basic"):
-                for ufm in message.basic.fieldmetadata:
-                    modified.add((ufm.field.field, ufm.field.field_type))
+            for ufm in message.basic.fieldmetadata:
+                modified.add((ufm.field.field, ufm.field.field_type))
     return [FieldID(field=field, field_type=field_type) for field, field_type in modified]


-def
-
-    return messages[0].source
+def needs_prefilter_update(message: BrokerMessage) -> bool:
+    return message.reindex


-def
-    return any(message.reindex for message in messages)
-
-
-def needs_paragraphs_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
+def needs_paragraphs_update(field_id: FieldID, message: BrokerMessage) -> bool:
     return (
-        has_paragraph_annotations(field_id, messages)
-        or has_new_extracted_text(field_id, messages)
-        or has_new_field_metadata(field_id, messages)
+        has_paragraph_annotations(field_id, message)
+        or has_new_extracted_text(field_id, message)
+        or has_new_field_metadata(field_id, message)
     )


-def has_paragraph_annotations(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
-
-
-
-
-
-
-
-        if len(ufm.paragraphs) > 0:
-            return True
-    return False
+def has_paragraph_annotations(field_id: FieldID, message: BrokerMessage) -> bool:
+    ufm = next(
+        (fm for fm in message.basic.fieldmetadata if fm.field == field_id),
+        None,
+    )
+    if ufm is None:
+        return False
+    return len(ufm.paragraphs) > 0


 def has_new_field_metadata(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for field_metadata in message.field_metadata:
-            if field_metadata.field == field_id:
-                return True
-    return False
+    return any(field_metadata.field == field_id for field_metadata in message.field_metadata)


 def has_new_extracted_text(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for extracted_text in message.extracted_text:
-            if extracted_text.field == field_id:
-                return True
-    return False
+    return any(extracted_text.field == field_id for extracted_text in message.extracted_text)


 def needs_texts_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    return has_new_extracted_text(field_id, messages) or has_new_field_metadata(field_id, messages)
+    return has_new_extracted_text(field_id, message) or has_new_field_metadata(field_id, message)


 def needs_vectors_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for field_vectors in message.field_vectors:
-            if field_vectors.field == field_id:
-                return True
-    return False
+    return any(field_vectors.field == field_id for field_vectors in message.field_vectors)
+
+
+async def get_bm_modified_split_ids(
+    conversation_field_id: FieldID,
+    message: BrokerMessage,
+    resource: Resource,
+) -> set[str]:
+    message_etw = next(
+        (etw for etw in message.extracted_text if etw.field == conversation_field_id), None
+    )
+    if message_etw is None:
+        return set()
+    storage = resource.storage
+    if message_etw.HasField("file"):
+        raw_payload = await storage.downloadbytescf(message_etw.file)
+        message_extracted_text = ExtractedText()
+        message_extracted_text.ParseFromString(raw_payload.read())
+        raw_payload.flush()
+    else:
+        message_extracted_text = message_etw.body
+    return set(message_extracted_text.split_text.keys())
+
+
+async def get_stored_split_ids(
+    conversation_field_id: FieldID,
+    resource: Resource,
+) -> set[str]:
+    fid = conversation_field_id
+    conv: Conversation = await resource.get_field(fid.field, fid.field_type, load=False)
+    splits_metadata = await conv.get_splits_metadata()
+    return set(splits_metadata.metadata)


 def needs_relations_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    return has_new_field_metadata(field_id, messages) or has_new_extracted_text(field_id, messages)
+    return has_new_field_metadata(field_id, message) or has_new_extracted_text(field_id, message)


 async def get_resource_index_message(