nucliadb 6.7.2.post4862-py3-none-any.whl → 6.9.2.post5282-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nucliadb has been flagged as possibly problematic.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -54,7 +54,7 @@ class ExportImportDataManager:
 
     async def get_metadata(self, type: str, kbid: str, id: str) -> Metadata:
         key = self._get_maindb_metadata_key(type, kbid, id)
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             data = await txn.get(key)
             if data is None or data == b"":
                 raise MetadataNotFound()
@@ -89,13 +89,13 @@ class ExportImportDataManager:
         metadata.modified = datetime.now(timezone.utc)
         key = self._get_maindb_metadata_key(type, metadata.kbid, metadata.id)
         data = metadata.model_dump_json().encode("utf-8")
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             await txn.set(key, data)
             await txn.commit()
 
     async def delete_metadata(self, type: str, metadata: Metadata):
         key = self._get_maindb_metadata_key(type, metadata.kbid, metadata.id)
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(key)
             await txn.commit()
 
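
Note on the hunks above: call sites that previously passed read_only to driver.transaction() now use the new ro_transaction()/rw_transaction() helpers, presumably added by the +11/-1 change to nucliadb/common/maindb/driver.py listed above. A minimal sketch of how such helpers could wrap the existing context manager; this is an illustration, not the actual nucliadb Driver code:

from contextlib import asynccontextmanager

class DriverSketch:
    @asynccontextmanager
    async def transaction(self, read_only: bool = False):
        txn = object()  # placeholder: a real driver would open a maindb transaction here
        try:
            yield txn
        finally:
            pass  # a real driver would abort here if the transaction was never committed

    def ro_transaction(self):
        # read-only convenience wrapper, so call sites no longer pass the flag themselves
        return self.transaction(read_only=True)

    def rw_transaction(self):
        # read-write convenience wrapper
        return self.transaction(read_only=False)
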
@@ -31,6 +31,7 @@ from opentelemetry.trace import (
     Link,
 )
 
+from nucliadb.common.http_clients.exceptions import ServiceUnavailableException
 from nucliadb.common.http_clients.processing import (
     ProcessingHTTPClient,
     ProcessingPullMessageProgressUpdater,
@@ -209,6 +210,12 @@ class PullV2Worker:
                     payload_length = len(base64.b64decode(data.payload))
                     logger.error(f"Message too big for transaction: {payload_length}")
                 raise e
+
+            except ServiceUnavailableException as ex:
+                logger.warning(f"Processing api is unavailable, will retry shortly: {ex}")
+                await processing_http_client.reset_session()
+                await asyncio.sleep(self.pull_time_error_backoff)
+
             except Exception:
                 logger.exception("Unhandled error pulling messages from processing")
                 await asyncio.sleep(self.pull_time_error_backoff)
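
The second hunk gives PullV2Worker a dedicated path for a temporarily unavailable processing API: reset the HTTP session and back off, instead of falling through to the generic error handler. A self-contained sketch of that retry pattern; pull_once and client are placeholder names, not nucliadb APIs:

import asyncio

class ServiceUnavailableException(Exception):
    """Stand-in for nucliadb.common.http_clients.exceptions.ServiceUnavailableException."""

async def pull_loop(client, pull_once, backoff: float = 5.0) -> None:
    while True:
        try:
            await pull_once(client)
        except ServiceUnavailableException as ex:
            # Transient outage: recycle the session and retry after the backoff.
            print(f"Processing api is unavailable, will retry shortly: {ex}")
            await client.reset_session()
            await asyncio.sleep(backoff)
        except Exception:
            # Anything else stays on the pre-existing catch-all path.
            print("Unhandled error pulling messages from processing")
            await asyncio.sleep(backoff)
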
@@ -22,21 +22,18 @@ import sys
 from functools import partial
 from typing import Awaitable, Callable, Optional
 
-from nucliadb.common.back_pressure.materializer import BackPressureMaterializer
-from nucliadb.common.back_pressure.settings import settings as back_pressure_settings
 from nucliadb.common.maindb.utils import setup_driver
 from nucliadb.ingest import SERVICE_NAME, logger
 from nucliadb.ingest.consumer.consumer import IngestConsumer
 from nucliadb.ingest.consumer.pull import PullV2Worker
 from nucliadb.ingest.settings import settings
 from nucliadb_utils.exceptions import ConfigurationError
-from nucliadb_utils.settings import indexing_settings, transaction_settings
+from nucliadb_utils.settings import transaction_settings
 from nucliadb_utils.utilities import (
     get_audit,
     get_nats_manager,
     get_pubsub,
     get_storage,
-    start_nats_manager,
 )
 
 from .auditing import IndexAuditHandler, ResourceWritesAuditHandler
@@ -57,27 +54,6 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
     await asyncio.gather(*tasks, return_exceptions=True)
 
 
-async def start_back_pressure() -> BackPressureMaterializer:
-    logger.info("Starting back pressure materializer")
-    nats_manager = await start_nats_manager(
-        SERVICE_NAME,
-        indexing_settings.index_jetstream_servers,
-        indexing_settings.index_jetstream_auth,
-    )
-    back_pressure = BackPressureMaterializer(
-        nats_manager,
-        indexing_check_interval=back_pressure_settings.indexing_check_interval,
-        ingest_check_interval=back_pressure_settings.ingest_check_interval,
-    )
-    await back_pressure.start()
-    return back_pressure
-
-
-async def stop_back_pressure(materializer: BackPressureMaterializer) -> None:
-    await materializer.stop()
-    await materializer.nats_manager.finalize()
-
-
 async def start_ingest_consumers(
     service_name: Optional[str] = None,
 ) -> Callable[[], Awaitable[None]]:
@@ -164,9 +140,8 @@ async def start_shard_creator() -> Callable[[], Awaitable[None]]:
     driver = await setup_driver()
     pubsub = await get_pubsub()
     assert pubsub is not None, "Pubsub is not configured"
-    storage = await get_storage(service_name=SERVICE_NAME)
 
-    shard_creator = ShardCreatorHandler(driver=driver, storage=storage, pubsub=pubsub)
+    shard_creator = ShardCreatorHandler(driver=driver, pubsub=pubsub)
    await shard_creator.initialize()
 
     return shard_creator.finalize
@@ -25,14 +25,14 @@ from typing import Any
 
 from nidx_protos import nodereader_pb2, noderesources_pb2
 
-from nucliadb.common import locking
+from nucliadb.common import datamanagers, locking
+from nucliadb.common.cluster.settings import settings
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb_protos import writer_pb2
 from nucliadb_utils import const
 from nucliadb_utils.cache.pubsub import PubSubDriver
-from nucliadb_utils.storages.storage import Storage
 
 from . import metrics
 from .utils import DelayedTaskHandler
@@ -52,12 +52,10 @@ class ShardCreatorHandler:
         self,
         *,
         driver: Driver,
-        storage: Storage,
         pubsub: PubSubDriver,
         check_delay: float = 10.0,
     ):
         self.driver = driver
-        self.storage = storage
         self.pubsub = pubsub
         self.shard_manager = get_shard_manager()
         self.task_handler = DelayedTaskHandler(check_delay)
@@ -91,7 +89,7 @@ class ShardCreatorHandler:
     @metrics.handler_histo.wrap({"type": "shard_creator"})
     async def process_kb(self, kbid: str) -> None:
         logger.info({"message": "Processing notification for kbid", "kbid": kbid})
-        async with self.driver.transaction(read_only=True) as txn:
+        async with self.driver.ro_transaction() as txn:
             current_shard = await self.shard_manager.get_current_active_shard(txn, kbid)
 
         if current_shard is None:
@@ -111,4 +109,17 @@ class ShardCreatorHandler:
                 shard_id=noderesources_pb2.ShardId(id=current_shard.nidx_shard_id)
             ) # type: ignore
         )
-        await self.shard_manager.maybe_create_new_shard(kbid, shard.paragraphs)
+
+        if not should_create_new_shard(shard.paragraphs):
+            return
+
+        logger.info({"message": "Adding shard", "kbid": kbid})
+        async with datamanagers.with_rw_transaction() as txn:
+            kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
+            prewarm = kb_config is not None and kb_config.prewarm_enabled
+            await self.shard_manager.create_shard_by_kbid(txn, kbid, prewarm_enabled=prewarm)
+            await txn.commit()
+
+
+def should_create_new_shard(num_paragraphs: int) -> bool:
+    return num_paragraphs > settings.max_shard_paragraphs
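
With this change the shard-split decision becomes an explicit predicate followed by an explicit create call, which also reads the KB config to decide whether to prewarm the new shard. Worked examples of the predicate, assuming an illustrative settings.max_shard_paragraphs of 500_000:

should_create_new_shard(500_001)  # True: the active shard exceeds the limit, add a new one
should_create_new_shard(500_000)  # False: exactly at the limit, keep the current shard
should_create_new_shard(1_000)    # False: plenty of room left
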
@@ -29,6 +29,7 @@ from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
 from google.protobuf.message import DecodeError, Message
 
 from nucliadb.common import datamanagers
+from nucliadb.common.ids import FieldId
 from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
@@ -46,10 +47,8 @@ from nucliadb_protos.resources_pb2 import (
 )
 from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
 from nucliadb_protos.writer_pb2 import Error, FieldStatus
-from nucliadb_utils import const
 from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
 from nucliadb_utils.storages.storage import Storage, StorageField
-from nucliadb_utils.utilities import has_feature
 
 logger = logging.getLogger(__name__)
 
@@ -125,6 +124,14 @@ class Field(Generic[PbType]):
     def uuid(self) -> str:
         return self.resource.uuid
 
+    @property
+    def field_id(self) -> FieldId:
+        return FieldId(
+            rid=self.resource.uuid,
+            type=self.type,
+            key=self.id,
+        )
+
     @property
     def storage(self) -> Storage:
         return self.resource.storage
@@ -215,21 +222,6 @@ class Field(Generic[PbType]):
     ) -> None:
         # Try delete vectors
         sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
-
-        if has_feature(const.Features.DEBUG_MISSING_VECTORS):
-            # This is a very chatty log. It is just a temporary hint while debugging an issue.
-            logger.info(
-                "Deleting vectors from storage",
-                extra={
-                    "kbid": self.kbid,
-                    "rid": self.resource.uuid,
-                    "field": f"{self.type}/{self.id}",
-                    "vectorset": vectorset,
-                    "storage_key_kind": storage_key_kind,
-                    "key": sf.key,
-                    "bucket": sf.bucket,
-                },
-            )
         try:
             await self.storage.delete_upload(sf.key, sf.bucket)
         except KeyError:
@@ -21,13 +21,16 @@ import uuid
 from typing import Any, Optional
 
 from nucliadb.ingest.fields.base import Field
-from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
+from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitMetadata, SplitsMetadata
 from nucliadb_protos.resources_pb2 import Conversation as PBConversation
 from nucliadb_utils.storages.storage import StorageField
 
+MAX_CONVERSATION_MESSAGES = 50 * 1024
+
 PAGE_SIZE = 200
 
 CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
+CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"
 CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
 
 
@@ -52,9 +55,22 @@ class Conversation(Field[PBConversation]):
     ):
         super(Conversation, self).__init__(id, resource, pb, value)
         self.value = {}
+        self._splits_metadata: Optional[SplitsMetadata] = None
+        self.metadata = None
+
+    async def delete_value(self):
+        await self.resource.txn.delete_by_prefix(
+            CONVERSATION_METADATA.format(kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id)
+        )
+        self._split_metadata = None
         self.metadata = None
+        self.value.clear()
 
     async def set_value(self, payload: PBConversation):
+        if payload.replace_field:
+            # As we need to overwrite the value of the conversation, first delete any previous data.
+            await self.delete_value()
+
         metadata = await self.get_metadata()
         metadata.extract_strategy = payload.extract_strategy
         metadata.split_strategy = payload.split_strategy
@@ -70,10 +86,13 @@ class Conversation(Field[PBConversation]):
             last_page = PBConversation()
             metadata.pages += 1
 
+        self._splits_metadata = await self.get_splits_metadata()
+
         # Make sure message attachment files are on our region. This is needed
         # to support the hybrid-onprem deployment as the attachments must be stored
         # at the storage services of the client's premises.
         for message in payload.messages:
+            self._splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
             new_message_files = []
             for idx, file in enumerate(message.content.attachments):
                 if self.storage.needs_move(file, self.kbid):
@@ -117,6 +136,7 @@ class Conversation(Field[PBConversation]):
 
         # Finally, set the metadata
         await self.db_set_metadata(metadata)
+        await self.set_splits_metadata(self._splits_metadata)
 
     async def get_value(self, page: Optional[int] = None) -> Optional[PBConversation]:
         # If no page was requested, force fetch of metadata
@@ -203,3 +223,29 @@ class Conversation(Field[PBConversation]):
         self.metadata = payload
         self.resource.modified = True
         self._created = False
+
+    async def get_splits_metadata(self) -> SplitsMetadata:
+        if self._splits_metadata is None:
+            field_key = CONVERSATION_SPLITS_METADATA.format(
+                kbid=self.kbid,
+                uuid=self.uuid,
+                type=self.type,
+                field=self.id,
+            )
+            payload = await self.resource.txn.get(field_key)
+            if payload is None:
+                return SplitsMetadata()
+            self._splits_metadata = SplitsMetadata()
+            self._splits_metadata.ParseFromString(payload)
+        return self._splits_metadata
+
+    async def set_splits_metadata(self, payload: SplitsMetadata) -> None:
+        key = CONVERSATION_SPLITS_METADATA.format(
+            kbid=self.kbid,
+            uuid=self.uuid,
+            type=self.type,
+            field=self.id,
+        )
+        await self.resource.txn.set(key, payload.SerializeToString())
+        self._split_metadata = payload
+        self.resource.modified = True
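
The new per-split metadata gets its own maindb key next to the existing conversation metadata, and the value is a serialized SplitsMetadata protobuf. An illustrative round-trip using the key template introduced above; the kbid/uuid/type/field values are made up:

from nucliadb_protos.resources_pb2 import SplitsMetadata

CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"

key = CONVERSATION_SPLITS_METADATA.format(
    kbid="my-kb", uuid="resource-uuid", type="c", field="support-chat"
)
assert key == "/kbs/my-kb/r/resource-uuid/f/c/support-chat/splits_metadata"

# The value stored under that key is the serialized protobuf:
serialized = SplitsMetadata().SerializeToString()
restored = SplitsMetadata()
restored.ParseFromString(serialized)
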
@@ -193,7 +193,7 @@ class ResourceBrain:
         if field_author is not None and field_author.WhichOneof("author") == "data_augmentation":
             field_type, field_id = field_key.split("/")
             da_task_id = ids.extract_data_augmentation_id(field_id)
-            if da_task_id is None: # pragma: nocover
+            if da_task_id is None: # pragma: no cover
                 logger.warning(
                     "Data augmentation field id has an unexpected format! Skipping label",
                     extra={
@@ -217,6 +217,7 @@
         replace_field: bool,
         skip_paragraphs_index: Optional[bool],
         skip_texts_index: Optional[bool],
+        append_splits: Optional[set[str]] = None,
     ) -> None:
         # We need to add the extracted text to the texts section of the Resource so that
         # the paragraphs can be indexed
@@ -234,6 +235,7 @@
             user_field_metadata,
             replace_field=replace_field,
             skip_paragraphs=skip_paragraphs_index,
+            append_splits=append_splits,
         )
 
     @observer.wrap({"type": "apply_field_paragraphs"})
@@ -246,6 +248,7 @@
         user_field_metadata: Optional[UserFieldMetadata],
         replace_field: bool,
         skip_paragraphs: Optional[bool],
+        append_splits: Optional[set[str]] = None,
     ) -> None:
         if skip_paragraphs is not None:
             self.brain.skip_paragraphs = skip_paragraphs
@@ -254,7 +257,12 @@
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
         # Splits of the field
         for subfield, field_metadata in field_computed_metadata.split_metadata.items():
-            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
+            if subfield not in extracted_text.split_text:
+                # No extracted text for this split
+                continue
+            extracted_text_str = extracted_text.split_text[subfield]
             for idx, paragraph in enumerate(field_metadata.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                 denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -308,7 +316,7 @@
                 self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
         # Main field
-        extracted_text_str = extracted_text.text if extracted_text else None
+        extracted_text_str = extracted_text.text
        for idx, paragraph in enumerate(field_computed_metadata.metadata.paragraphs):
            key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -496,9 +504,12 @@
         replace_field: bool = False,
         # cut to specific dimension if specified
         vector_dimension: Optional[int] = None,
+        append_splits: Optional[set[str]] = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
             _field_id = ids.FieldId(
                 rid=fid.rid,
                 type=fid.type,
@@ -792,3 +803,10 @@
         if len(self._materialized) > 0:
             return self._materialized[-1]
         return 0
+
+
+def should_skip_split_indexing(
+    split: str, replace_field: bool, append_splits: Optional[set[str]]
+) -> bool:
+    # When replacing the whole field, reindex all splits. Otherwise, we're only indexing the splits that are appended
+    return not replace_field and append_splits is not None and split not in append_splits
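
should_skip_split_indexing is what implements append-only reindexing of conversation splits: when a field is appended to rather than replaced, only the newly appended splits are indexed. Worked examples of the predicate as defined above (the split names are made up):

# Replacing the whole field: never skip, every split is reindexed.
should_skip_split_indexing("m1", replace_field=True, append_splits={"m2"})    # False
# No append information available: index every split.
should_skip_split_indexing("m1", replace_field=False, append_splits=None)     # False
# Appending only "m2": splits not in the append set are skipped.
should_skip_split_indexing("m1", replace_field=False, append_splits={"m2"})   # True
should_skip_split_indexing("m2", replace_field=False, append_splits={"m2"})   # False
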