nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/ingest/fields/conversation.py
CHANGED

@@ -18,16 +18,19 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import uuid
-from typing import Any
+from typing import Any

 from nucliadb.ingest.fields.base import Field
-from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
+from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitsMetadata
 from nucliadb_protos.resources_pb2 import Conversation as PBConversation
 from nucliadb_utils.storages.storage import StorageField

+MAX_CONVERSATION_MESSAGES = None  # No limit
+
 PAGE_SIZE = 200

 CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
+CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"
 CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"


@@ -39,7 +42,7 @@ class Conversation(Field[PBConversation]):
     pbklass = PBConversation
     type: str = "c"
     value: dict[int, PBConversation]
-    metadata:
+    metadata: FieldConversation | None

     _created: bool = False

@@ -47,20 +50,33 @@ class Conversation(Field[PBConversation]):
         self,
         id: str,
         resource: Any,
-        pb:
-        value:
+        pb: Any | None = None,
+        value: dict[int, PBConversation] | None = None,
     ):
-        super(
+        super().__init__(id, resource, pb, value)
         self.value = {}
+        self._splits_metadata: SplitsMetadata | None = None
+        self.metadata = None
+
+    async def delete_value(self):
+        await self.resource.txn.delete_by_prefix(
+            CONVERSATION_METADATA.format(kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id)
+        )
+        self._split_metadata = None
         self.metadata = None
+        self.value.clear()

     async def set_value(self, payload: PBConversation):
+        if payload.replace_field:
+            # As we need to overwrite the value of the conversation, first delete any previous data.
+            await self.delete_value()
+
         metadata = await self.get_metadata()
         metadata.extract_strategy = payload.extract_strategy
         metadata.split_strategy = payload.split_strategy

         # Get the last page if it exists
-        last_page:
+        last_page: PBConversation | None = None
         if self._created is False and metadata.pages > 0:
             try:
                 last_page = await self.db_get_value(page=metadata.pages)
@@ -70,10 +86,13 @@ class Conversation(Field[PBConversation]):
                 last_page = PBConversation()
                 metadata.pages += 1

+        self._splits_metadata = await self.get_splits_metadata()
+
         # Make sure message attachment files are on our region. This is needed
         # to support the hybrid-onprem deployment as the attachments must be stored
         # at the storage services of the client's premises.
         for message in payload.messages:
+            self._splits_metadata.metadata.get_or_create(message.ident)
             new_message_files = []
             for idx, file in enumerate(message.content.attachments):
                 if self.storage.needs_move(file, self.kbid):
@@ -117,8 +136,9 @@ class Conversation(Field[PBConversation]):

         # Finally, set the metadata
         await self.db_set_metadata(metadata)
+        await self.set_splits_metadata(self._splits_metadata)

-    async def get_value(self, page:
+    async def get_value(self, page: int | None = None) -> PBConversation | None:
         # If no page was requested, force fetch of metadata
         # and set the page to the last page
         if page is None and self.metadata is None:
@@ -133,7 +153,7 @@ class Conversation(Field[PBConversation]):
         except PageNotFound:
             return None

-    async def get_full_conversation(self) ->
+    async def get_full_conversation(self) -> PBConversation | None:
         """
         Messages of a conversations may be stored across several pages.
         This method fetches them all and returns a single complete conversation.
@@ -203,3 +223,29 @@ class Conversation(Field[PBConversation]):
         self.metadata = payload
         self.resource.modified = True
         self._created = False
+
+    async def get_splits_metadata(self) -> SplitsMetadata:
+        if self._splits_metadata is None:
+            field_key = CONVERSATION_SPLITS_METADATA.format(
+                kbid=self.kbid,
+                uuid=self.uuid,
+                type=self.type,
+                field=self.id,
+            )
+            payload = await self.resource.txn.get(field_key)
+            if payload is None:
+                return SplitsMetadata()
+            self._splits_metadata = SplitsMetadata()
+            self._splits_metadata.ParseFromString(payload)
+        return self._splits_metadata
+
+    async def set_splits_metadata(self, payload: SplitsMetadata) -> None:
+        key = CONVERSATION_SPLITS_METADATA.format(
+            kbid=self.kbid,
+            uuid=self.uuid,
+            type=self.type,
+            field=self.id,
+        )
+        await self.resource.txn.set(key, payload.SerializeToString())
+        self._split_metadata = payload
+        self.resource.modified = True

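Note on the splits metadata introduced above: each conversation field now persists a SplitsMetadata protobuf under its own maindb key (CONVERSATION_SPLITS_METADATA), with one entry per message ident. A minimal sketch of that round trip, for illustration only; `txn` is assumed to stand in for the resource transaction with async get/set used in the diff, and the helper name is hypothetical:

from nucliadb_protos.resources_pb2 import SplitsMetadata

CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"


async def roundtrip_splits_metadata(txn, kbid: str, rid: str, field: str) -> SplitsMetadata:
    # Hypothetical helper mirroring get_splits_metadata/set_splits_metadata above.
    key = CONVERSATION_SPLITS_METADATA.format(kbid=kbid, uuid=rid, type="c", field=field)

    # One entry per message split (the diff calls metadata.get_or_create(message.ident)).
    sm = SplitsMetadata()
    sm.metadata.get_or_create("message-1")
    await txn.set(key, sm.SerializeToString())

    # Reading back: a missing key simply means no splits metadata has been stored yet.
    payload = await txn.get(key)
    loaded = SplitsMetadata()
    if payload is not None:
        loaded.ParseFromString(payload)
    return loaded
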
nucliadb/ingest/fields/exceptions.py
CHANGED

@@ -17,7 +17,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Type


 class InvalidFieldClass(Exception):
@@ -25,7 +24,7 @@ class InvalidFieldClass(Exception):


 class InvalidPBClass(Exception):
-    def __init__(self, source:
+    def __init__(self, source: type, destination: type):
         self.source = source
         self.destination = destination
         super().__init__(f"Source and destination does not match {self.source} - {self.destination}")

nucliadb/ingest/fields/file.py
CHANGED

@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Any
+from typing import Any

 from nucliadb.ingest.fields.base import Field
 from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
@@ -30,22 +30,22 @@ class File(Field[FieldFile]):
     pbklass = FieldFile
     value: FieldFile
     type: str = "f"
-    file_extracted_data:
+    file_extracted_data: FileExtractedData | None

     def __init__(
         self,
         id: str,
         resource: Any,
-        pb:
-        value:
+        pb: Any | None = None,
+        value: str | None = None,
     ):
-        super(
+        super().__init__(id, resource, pb, value)
         self.file_extracted_data = None

     async def set_value(self, payload: FieldFile):
         old_file = await self.get_value()
         if old_file is None:
-            old_cf:
+            old_cf: CloudFile | None = None
         else:
             old_cf = old_file.file

@@ -57,7 +57,7 @@ class File(Field[FieldFile]):

         await self.db_set_value(payload)

-    async def get_value(self) ->
+    async def get_value(self) -> FieldFile | None:
         return await self.db_get_value()

     async def set_file_extracted_data(self, file_extracted_data: FileExtractedData):
@@ -101,10 +101,24 @@ class File(Field[FieldFile]):
         await self.storage.upload_pb(sf, file_extracted_data)
         self.file_extracted_data = file_extracted_data

-    async def get_file_extracted_data(self) ->
+    async def get_file_extracted_data(self) -> FileExtractedData | None:
         if self.file_extracted_data is None:
             sf: StorageField = self.storage.file_extracted(
                 self.kbid, self.uuid, self.type, self.id, FILE_METADATA
             )
             self.file_extracted_data = await self.storage.download_pb(sf, FileExtractedData)
         return self.file_extracted_data
+
+    async def thumbnail(self) -> StorageField | None:
+        """Access the file field thumbnail."""
+        fed = await self.get_file_extracted_data()
+        if fed is None:
+            return None
+        if not fed.HasField("file_thumbnail"):
+            return None
+
+        sf: StorageField = self.storage.file_extracted(
+            self.kbid, self.uuid, self.type, self.id, "file_thumbnail"
+        )
+        sf.field = fed.file_thumbnail
+        return sf

nucliadb/ingest/fields/link.py
CHANGED

@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Any
+from typing import Any

 from nucliadb.ingest.fields.base import Field
 from nucliadb_protos.resources_pb2 import CloudFile, FieldLink, LinkExtractedData
@@ -30,22 +30,22 @@ class Link(Field[FieldLink]):
     pbklass = FieldLink
     value: FieldLink
     type: str = "u"
-    link_extracted_data:
+    link_extracted_data: LinkExtractedData | None

     def __init__(
         self,
         id: str,
         resource: Any,
-        pb:
-        value:
+        pb: Any | None = None,
+        value: str | None = None,
     ):
-        super(
+        super().__init__(id, resource, pb, value)
         self.link_extracted_data = None

     async def set_value(self, payload: FieldLink):
         await self.db_set_value(payload)

-    async def get_value(self) ->
+    async def get_value(self) -> FieldLink | None:
         return await self.db_get_value()

     async def set_link_extracted_data(self, link_extracted_data: LinkExtractedData):
@@ -88,7 +88,7 @@ class Link(Field[FieldLink]):
         await self.storage.upload_pb(sf, link_extracted_data)
         self.link_extracted_data = link_extracted_data

-    async def get_link_extracted_data(self) ->
+    async def get_link_extracted_data(self) -> LinkExtractedData | None:
         if self.link_extracted_data is None:
             sf: StorageField = self.storage.file_extracted(
                 self.kbid, self.uuid, self.type, self.id, LINK_METADATA

nucliadb/ingest/fields/text.py
CHANGED

@@ -19,7 +19,6 @@
 #

 import hashlib
-from typing import Optional

 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
@@ -39,8 +38,8 @@ class Text(Field[FieldText]):

     async def set_value(self, payload: FieldText):
         if payload.md5 == "":
-            payload.md5 = hashlib.md5(payload.body.encode()).hexdigest()
+            payload.md5 = hashlib.md5(payload.body.encode(), usedforsecurity=False).hexdigest()
         await self.db_set_value(payload)

-    async def get_value(self) ->
+    async def get_value(self) -> FieldText | None:
         return await self.db_get_value()

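The md5 change above adds usedforsecurity=False, which matters on FIPS-restricted Python builds where a plain hashlib.md5() call can be rejected; here the digest is presumably only a content fingerprint for the text body, not a security primitive. A standalone illustration (the usedforsecurity keyword exists since Python 3.9):

import hashlib


def content_md5(body: str) -> str:
    # Mark the digest as a non-security checksum so FIPS-enabled builds allow it.
    return hashlib.md5(body.encode(), usedforsecurity=False).hexdigest()
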
nucliadb/ingest/orm/brain_v2.py
CHANGED

@@ -18,9 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import logging
+from collections.abc import Iterator
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Optional

 from nidx_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
 from nidx_protos.noderesources_pb2 import (
@@ -81,9 +81,9 @@ class ResourceBrain:
         self,
         basic: Basic,
         user_relations: Relations,
-        origin:
-        previous_processing_status:
-        security:
+        origin: Origin | None,
+        previous_processing_status: Metadata.Status.ValueType | None,
+        security: utils_pb2.Security | None,
     ) -> None:
         self._set_resource_status(basic, previous_processing_status)
         self._set_resource_dates(basic, origin)
@@ -97,9 +97,9 @@ class ResourceBrain:
         self,
         field_key: str,
         extracted_text: ExtractedText,
-        field_computed_metadata:
-        basic_user_metadata:
-        field_author:
+        field_computed_metadata: FieldComputedMetadata | None,
+        basic_user_metadata: UserMetadata | None,
+        field_author: FieldAuthor | None,
         replace_field: bool,
         skip_index: bool,
     ) -> None:
@@ -122,13 +122,17 @@ class ResourceBrain:
         field_key: str,
         extracted_text: ExtractedText,
         replace_field: bool,
-        skip_texts:
+        skip_texts: bool | None,
     ):
         if skip_texts is not None:
             self.brain.skip_texts = skip_texts
+
         field_text = extracted_text.text
-
-
+
+        for split_id in self.sorted_splits(extracted_text):
+            split_text = extracted_text.split_text[split_id]
+            field_text += f"{split_text} "
+
         self.brain.texts[field_key].text = field_text

         if replace_field:
@@ -140,18 +144,16 @@ class ResourceBrain:
     def apply_field_labels(
         self,
         field_key: str,
-        field_computed_metadata:
-        field_author:
-        basic_user_metadata:
+        field_computed_metadata: FieldComputedMetadata | None,
+        field_author: FieldAuthor | None,
+        basic_user_metadata: UserMetadata | None = None,
     ):
         user_cancelled_labels: set[str] = (
-
-
-
-
-
-                ]
-            )
+            {
+                f"{classification.labelset}/{classification.label}"
+                for classification in basic_user_metadata.classifications
+                if classification.cancelled_by_user
+            }
             if basic_user_metadata
             else set()
         )
@@ -193,7 +195,7 @@ class ResourceBrain:
         if field_author is not None and field_author.WhichOneof("author") == "data_augmentation":
             field_type, field_id = field_key.split("/")
             da_task_id = ids.extract_data_augmentation_id(field_id)
-            if da_task_id is None:  # pragma:
+            if da_task_id is None:  # pragma: no cover
                 logger.warning(
                     "Data augmentation field id has an unexpected format! Skipping label",
                     extra={
@@ -212,12 +214,17 @@ class ResourceBrain:
         field_key: str,
         field_computed_metadata: FieldComputedMetadata,
         extracted_text: ExtractedText,
-        page_positions:
-        user_field_metadata:
+        page_positions: FilePagePositions | None,
+        user_field_metadata: UserFieldMetadata | None,
         replace_field: bool,
-        skip_paragraphs_index:
-        skip_texts_index:
+        skip_paragraphs_index: bool | None,
+        skip_texts_index: bool | None,
+        append_splits: set[str] | None = None,
     ) -> None:
+        """
+        append_splits: when provided, only the splits in this set will be indexed. This is used for conversation appends, to
+        avoid reindexing all previous messages of the conversation.
+        """
         # We need to add the extracted text to the texts section of the Resource so that
         # the paragraphs can be indexed
         self.apply_field_text(
@@ -234,27 +241,45 @@ class ResourceBrain:
             user_field_metadata,
             replace_field=replace_field,
             skip_paragraphs=skip_paragraphs_index,
+            append_splits=append_splits,
         )

+    def sorted_splits(self, extracted_text: ExtractedText) -> Iterator[str]:
+        yield from sorted(extracted_text.split_text.keys())
+
     @observer.wrap({"type": "apply_field_paragraphs"})
     def apply_field_paragraphs(
         self,
         field_key: str,
         field_computed_metadata: FieldComputedMetadata,
         extracted_text: ExtractedText,
-        page_positions:
-        user_field_metadata:
+        page_positions: FilePagePositions | None,
+        user_field_metadata: UserFieldMetadata | None,
         replace_field: bool,
-        skip_paragraphs:
+        skip_paragraphs: bool | None,
+        append_splits: set[str] | None = None,
     ) -> None:
         if skip_paragraphs is not None:
             self.brain.skip_paragraphs = skip_paragraphs
         unique_paragraphs: set[str] = set()
         user_paragraph_classifications = self._get_paragraph_user_classifications(user_field_metadata)
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
+
         # Splits of the field
-
-
+
+        # Used to adjust the paragraph start/end when indexing splits, as they are all
+        # concatenated in the main text part of the brain Resource.
+        split_offset = 0
+        for subfield in self.sorted_splits(extracted_text):
+            if subfield not in field_computed_metadata.split_metadata or should_skip_split_indexing(
+                subfield, replace_field, append_splits
+            ):
+                # We're skipping this split but we need to adjust the offset as we have added the text
+                # of this split to the main text
+                split_offset += len(extracted_text.split_text[subfield]) + 1  # +1 for the space
+                continue
+            field_metadata = field_computed_metadata.split_metadata[subfield]
+            extracted_text_str = extracted_text.split_text[subfield]
             for idx, paragraph in enumerate(field_metadata.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                 denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -280,8 +305,8 @@ class ResourceBrain:
                 representation.file = paragraph.representation.reference_file
                 representation.is_a_table = paragraph.representation.is_a_table
                 p = BrainParagraph(
-                    start=paragraph.start,
-                    end=paragraph.end,
+                    start=paragraph.start + split_offset,
+                    end=paragraph.end + split_offset,
                     field=field_key,
                     split=subfield,
                     index=idx,
@@ -296,6 +321,7 @@ class ResourceBrain:
                         representation=representation,
                     ),
                 )
+                split_offset = p.end + 1  # +1 for the space
                 paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
                 paragraph_labels = {paragraph_kind_label}
                 paragraph_labels.update(
@@ -308,7 +334,7 @@ class ResourceBrain:
                 self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)

         # Main field
-        extracted_text_str = extracted_text.text
+        extracted_text_str = extracted_text.text
         for idx, paragraph in enumerate(field_computed_metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
             denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -364,7 +390,7 @@ class ResourceBrain:
             self.brain.paragraphs_to_delete.append(full_field_id)

     def _get_paragraph_user_classifications(
-        self, basic_user_field_metadata:
+        self, basic_user_field_metadata: UserFieldMetadata | None
     ) -> ParagraphClassifications:
         pc = ParagraphClassifications(valid={}, denied={})
         if basic_user_field_metadata is None:
@@ -383,18 +409,16 @@ class ResourceBrain:
     def generate_relations(
         self,
         field_key: str,
-        field_computed_metadata:
-        basic_user_metadata:
+        field_computed_metadata: FieldComputedMetadata | None,
+        basic_user_metadata: UserMetadata | None,
         replace_field: bool,
     ) -> None:
         user_cancelled_labels: set[str] = (
-
-
-
-
-
-                ]
-            )
+            {
+                f"{classification.labelset}/{classification.label}"
+                for classification in basic_user_metadata.classifications
+                if classification.cancelled_by_user
+            }
             if basic_user_metadata
             else set()
         )
@@ -483,7 +507,7 @@ class ResourceBrain:
         full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
         self.brain.texts_to_delete.append(full_field_id)
         self.brain.paragraphs_to_delete.append(full_field_id)
-        self.brain.
+        self.brain.vectors_to_delete_in_all_vectorsets.append(full_field_id)
         self.brain.relation_fields_to_delete.append(field_key)

     @observer.wrap({"type": "generate_vectors"})
@@ -495,10 +519,13 @@ class ResourceBrain:
         vectorset: str,
         replace_field: bool = False,
         # cut to specific dimension if specified
-        vector_dimension:
+        vector_dimension: int | None = None,
+        append_splits: set[str] | None = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
             _field_id = ids.FieldId(
                 rid=fid.rid,
                 type=fid.type,
@@ -567,7 +594,7 @@ class ResourceBrain:
         *,
         vectorset: str,
         # cut vectors if a specific dimension is specified
-        vector_dimension:
+        vector_dimension: int | None = None,
     ):
         paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
         sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
@@ -592,7 +619,7 @@ class ResourceBrain:

         sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index

-    def _set_resource_status(self, basic: Basic, previous_status:
+    def _set_resource_status(self, basic: Basic, previous_status: Metadata.Status.ValueType | None):
         """
         We purposefully overwrite what we index as a status and DO NOT reflect
         actual status with what we index.
@@ -622,32 +649,32 @@ class ResourceBrain:
             return "EMPTY"
         return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]

-    def _set_resource_dates(self, basic: Basic, origin:
+    def _set_resource_dates(self, basic: Basic, origin: Origin | None):
         """
         Adds the user-defined dates to the brain object. This is at resource level and applies to
         all fields of the resource.
         """
-        if basic.created.seconds
+        if basic.created.seconds != 0:
             self.brain.metadata.created.CopyFrom(basic.created)
         else:
             logging.warning(f"Basic metadata has no created field for {self.rid}")
             self.brain.metadata.created.GetCurrentTime()
-        if basic.modified.seconds
+        if basic.modified.seconds != 0:
             self.brain.metadata.modified.CopyFrom(basic.modified)
         else:
-            if basic.created.seconds
+            if basic.created.seconds != 0:
                 self.brain.metadata.modified.CopyFrom(basic.created)
             else:
                 self.brain.metadata.modified.GetCurrentTime()

         if origin is not None:
             # overwrite created/modified if provided on origin
-            if origin.HasField("created") and origin.created.seconds
+            if origin.HasField("created") and origin.created.seconds != 0:
                 self.brain.metadata.created.CopyFrom(origin.created)
-            if origin.HasField("modified") and origin.modified.seconds
+            if origin.HasField("modified") and origin.modified.seconds != 0:
                 self.brain.metadata.modified.CopyFrom(origin.modified)

-    def _set_resource_relations(self, basic: Basic, origin:
+    def _set_resource_relations(self, basic: Basic, origin: Origin | None, user_relations: Relations):
         """
         Adds the relations to the brain object corresponding to the user-defined metadata at the resource level:
         - Contributors of the document
@@ -691,7 +718,7 @@ class ResourceBrain:

         self.brain.relation_fields_to_delete.append("a/metadata")

-    def _set_resource_labels(self, basic: Basic, origin:
+    def _set_resource_labels(self, basic: Basic, origin: Origin | None):
         """
         Adds the resource-level labels to the brain object.
         These levels are user-defined in basic or origin metadata.
@@ -748,7 +775,7 @@

 def is_paragraph_repeated_in_field(
     paragraph: Paragraph,
-    extracted_text:
+    extracted_text: str | None,
     unique_paragraphs: set[str],
 ) -> bool:
     if extracted_text is None:
@@ -787,8 +814,13 @@ class ParagraphPages:
             return self._materialized[paragraph_start_index]
         except IndexError:
             logger.error(
-                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"
+                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"
             )
             if len(self._materialized) > 0:
                 return self._materialized[-1]
             return 0
+
+
+def should_skip_split_indexing(split: str, replace_field: bool, append_splits: set[str] | None) -> bool:
+    # When replacing the whole field, reindex all splits. Otherwise, we're only indexing the splits that are appended
+    return not replace_field and append_splits is not None and split not in append_splits

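The split-indexing changes above concatenate every split's extracted text after the main field text, so paragraph positions that are relative to a single split must be shifted before they are indexed, and splits that are neither replaced nor newly appended are skipped while still advancing the offset. A simplified, self-contained sketch of that bookkeeping (it advances the offset by the split length plus one separator space, whereas the diff advances it from the last indexed paragraph; names other than should_skip_split_indexing are illustrative):

def should_skip_split_indexing(split: str, replace_field: bool, append_splits: set[str] | None) -> bool:
    # Replacing the whole field reindexes every split; otherwise only appended splits are indexed.
    return not replace_field and append_splits is not None and split not in append_splits


def shifted_paragraphs(
    split_text: dict[str, str],
    split_paragraphs: dict[str, list[tuple[int, int]]],
    replace_field: bool,
    append_splits: set[str] | None,
) -> list[tuple[str, int, int]]:
    # Paragraph (start, end) offsets are relative to each split; the brain stores one
    # concatenated text, so indexed positions carry a running offset.
    shifted = []
    split_offset = 0
    for split_id in sorted(split_text.keys()):  # same ordering as sorted_splits()
        text = split_text[split_id]
        if should_skip_split_indexing(split_id, replace_field, append_splits):
            split_offset += len(text) + 1  # +1 for the joining space
            continue
        for start, end in split_paragraphs.get(split_id, []):
            shifted.append((split_id, start + split_offset, end + split_offset))
        split_offset += len(text) + 1
    return shifted


# Appending only split "b": split "a" is skipped but still counts towards the offset.
assert shifted_paragraphs(
    {"a": "hello world", "b": "second message"},
    {"a": [(0, 11)], "b": [(0, 14)]},
    replace_field=False,
    append_splits={"b"},
) == [("b", 12, 26)]
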
nucliadb/ingest/orm/broker_message.py
CHANGED

@@ -56,7 +56,7 @@ class _BrokerMessageBuilder:
         # clear the state and generate a new broker message
         self.bm.Clear()

-        self.bm.kbid = resource.
+        self.bm.kbid = resource.kbid
         self.bm.uuid = resource.uuid
         basic = await resource.get_basic()
         if basic is not None:
@@ -93,9 +93,7 @@ class _BrokerMessageBuilder:
             self.bm.link_extracted_data.append(link_extracted_data)

         # Field vectors
-        async for vectorset_id, vs in datamanagers.vectorsets.iter(
-            resource.txn, kbid=resource.kb.kbid
-        ):
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(resource.txn, kbid=resource.kbid):
             await self.generate_field_vectors(
                 type_id, field_id, field, vectorset_id, vs.storage_key_kind
             )