nucliadb-6.7.0.post4800-py3-none-any.whl → nucliadb-6.7.0.post4805-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
--- a/nucliadb/ingest/fields/base.py
+++ b/nucliadb/ingest/fields/base.py
@@ -332,37 +332,54 @@ class Field(Generic[PbType]):
         self.question_answers = actual_payload
 
     async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
+        actual_payload: Optional[ExtractedText] = None
         if self.type in SUBFIELDFIELDS:
+            # Try to get the previously extracted text protobuf if it exists so we can merge it with the new splits
+            # coming from the processing payload.
             try:
-                actual_payload: Optional[ExtractedText] = await self.get_extracted_text(force=True)
+                actual_payload = await self.get_extracted_text(force=True)
             except KeyError:
-                actual_payload = None
-        else:
-            actual_payload = None
-        sf = self.get_storage_field(FieldTypes.FIELD_TEXT)
+                # No previous extracted text found
+                pass
 
+        sf = self.get_storage_field(FieldTypes.FIELD_TEXT)
         if actual_payload is None:
-            # Its first extracted text
+            # No previous extracted text, this is the first time we set it so we can simply upload it to storage
             if payload.HasField("file"):
+                # Normalize the storage key if the payload is a reference to a file in storage.
+                # This is typically the case when the text is too large and we store it in a
+                # cloud file. Normalization is needed to ensure that the hybrid-onprem deployment stores
+                # the file in the correct bucket of its storage.
                 await self.storage.normalize_binary(payload.file, sf)
             else:
+                # Directly upload the ExtractedText protobuf to storage
                 await self.storage.upload_pb(sf, payload.body)
             self.extracted_text = payload.body
         else:
             if payload.HasField("file"):
+                # The extracted text coming from processing has a reference to another storage key.
+                # Download it and copy it to its ExtractedText.body field. This is typically for cases
+                # when the text is too large.
                 raw_payload = await self.storage.downloadbytescf(payload.file)
                 pb = ExtractedText()
                 pb.ParseFromString(raw_payload.read())
                 raw_payload.flush()
                 payload.body.CopyFrom(pb)
-            # We know its payload.body
+
+            # Update or set the extracted text text for each split coming from the processing payload
             for key, value in payload.body.split_text.items():
                 actual_payload.split_text[key] = value
+
+            # Apply any split deletions that may come in the processing payload
            for key in payload.body.deleted_splits:
                 if key in actual_payload.split_text:
                     del actual_payload.split_text[key]
+
+            # Finally, handle the main text body (for the cases where the text is not split)
             if payload.body.text != "":
                 actual_payload.text = payload.body.text
+
+            # Upload the updated ExtractedText to storage
             await self.storage.upload_pb(sf, actual_payload)
             self.extracted_text = actual_payload
 
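The hunk above reworks `set_extracted_text` so that, for split fields, splits coming from processing are merged into any previously extracted text instead of replacing it. A minimal standalone sketch of that merge logic, using plain dicts in place of the ExtractedText protobuf (the field names `text`, `split_text` and `deleted_splits` come from the diff; the function itself is illustrative):

```python
from typing import Optional


def merge_extracted_text(previous: Optional[dict], incoming: dict) -> dict:
    """Both dicts mimic ExtractedText: {"text": str, "split_text": dict, "deleted_splits": list}."""
    if previous is None:
        # First extraction for this field: store the incoming payload as-is.
        return incoming
    # Update or add each split coming from the processing payload.
    previous["split_text"].update(incoming["split_text"])
    # Apply any split deletions requested by processing.
    for key in incoming["deleted_splits"]:
        previous["split_text"].pop(key, None)
    # Replace the main text body only when processing sent one.
    if incoming["text"] != "":
        previous["text"] = incoming["text"]
    return previous
```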
--- a/nucliadb/ingest/fields/conversation.py
+++ b/nucliadb/ingest/fields/conversation.py
@@ -26,8 +26,9 @@ from nucliadb_protos.resources_pb2 import Conversation as PBConversation
 from nucliadb_utils.storages.storage import StorageField
 
 PAGE_SIZE = 200
-KB_RESOURCE_FIELD = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
-KB_RESOURCE_FIELD_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
+
+CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
+CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
 
 
 class PageNotFound(Exception):
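The renamed constants are plain format-string templates for the keys under which conversation pages and conversation metadata are stored; the page key is the metadata key plus a trailing page component. A quick illustration (all identifier values here are made up):

```python
CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"

page_key = CONVERSATION_PAGE_VALUE.format(
    kbid="kb-1", uuid="res-1", type="c", field="chat", page=3
)
assert page_key == "/kbs/kb-1/r/res-1/f/c/chat/3"

meta_key = CONVERSATION_METADATA.format(kbid="kb-1", uuid="res-1", type="c", field="chat")
assert meta_key == "/kbs/kb-1/r/res-1/f/c/chat"
```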
@@ -54,56 +55,65 @@ class Conversation(Field[PBConversation]):
         self.metadata = None
 
     async def set_value(self, payload: PBConversation):
-        last_page: Optional[PBConversation] = None
         metadata = await self.get_metadata()
         metadata.extract_strategy = payload.extract_strategy
         metadata.split_strategy = payload.split_strategy
 
+        # Get the last page if it exists
+        last_page: Optional[PBConversation] = None
         if self._created is False and metadata.pages > 0:
             try:
                 last_page = await self.db_get_value(page=metadata.pages)
             except PageNotFound:
                 pass
+        if last_page is None:
+            last_page = PBConversation()
+            metadata.pages += 1
 
-        # Make sure message attachment files are on our region
+        # Make sure message attachment files are on our region. This is needed
+        # to support the hybrid-onprem deployment as the attachments must be stored
+        # at the storage services of the client's premises.
         for message in payload.messages:
             new_message_files = []
-            for count, file in enumerate(message.content.attachments):
+            for idx, file in enumerate(message.content.attachments):
                 if self.storage.needs_move(file, self.kbid):
                     if message.ident == "":
                         message_ident = uuid.uuid4().hex
                     else:
                         message_ident = message.ident
-                    sf: StorageField = self.storage.conversation_field(
-                        self.kbid, self.uuid, self.id, message_ident, count
+                    sf: StorageField = self.storage.conversation_field_attachment(
+                        self.kbid, self.uuid, self.id, message_ident, attachment_index=idx
                     )
                     cf: CloudFile = await self.storage.normalize_binary(file, sf)
                     new_message_files.append(cf)
                 else:
                     new_message_files.append(file)
 
-            # Can be cleaned a list of PB
+            # Replace the attachments in the message with the new ones
             message.content.ClearField("attachments")
             for message_file in new_message_files:
                 message.content.attachments.append(message_file)
 
-        if last_page is None:
-            last_page = PBConversation()
-            metadata.pages += 1
-
-        # Merge on last page
+        # Increment the metadata total with the number of messages
         messages = list(payload.messages)
         metadata.total += len(messages)
+
+        # Store the messages in pages of PAGE_SIZE messages
         while len(messages) > 0:
-            count = metadata.size - len(last_page.messages)
-            last_page.messages.extend(messages[:count])
+            # Fit the messages in the last page
+            available_space = metadata.size - len(last_page.messages)
+            last_page.messages.extend(messages[:available_space])
+
+            # Save the last page
             await self.db_set_value(last_page, metadata.pages)
 
-            messages = messages[count:]
+            # If there are still messages, create a new page
+            messages = messages[available_space:]
             if len(messages) > 0:
                 metadata.pages += 1
                 last_page = PBConversation()
 
+        # Finally, set the metadata
         await self.db_set_metadata(metadata)
 
     async def get_value(self, page: Optional[int] = None) -> Optional[PBConversation]:
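The reworked `set_value` always materializes a last page up front, then spills incoming messages into pages of at most `metadata.size` messages (PAGE_SIZE by default). The paging loop is easier to follow in isolation; here is a self-contained sketch with plain lists standing in for PBConversation pages (only `available_space` is a name from the diff, the rest is illustrative):

```python
from typing import List


def paginate(pages: List[List[str]], new_messages: List[str], size: int = 200) -> List[List[str]]:
    # Ensure there is a last page to append to, mirroring the diff's
    # "if last_page is None: last_page = PBConversation()" step.
    if not pages:
        pages.append([])
    last_page = pages[-1]
    messages = list(new_messages)
    while messages:
        # Fit as many messages as the last page can still hold.
        available_space = size - len(last_page)
        last_page.extend(messages[:available_space])
        messages = messages[available_space:]
        # If there are still messages, open a fresh page.
        if messages:
            last_page = []
            pages.append(last_page)
    return pages


pages = paginate([], [f"m{i}" for i in range(450)], size=200)
assert [len(p) for p in pages] == [200, 200, 50]
```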
@@ -139,7 +149,7 @@ class Conversation(Field[PBConversation]):
     async def get_metadata(self) -> FieldConversation:
         if self.metadata is None:
             payload = await self.resource.txn.get(
-                KB_RESOURCE_FIELD_METADATA.format(
+                CONVERSATION_METADATA.format(
                     kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
                 )
             )
@@ -158,7 +168,7 @@ class Conversation(Field[PBConversation]):
             raise ValueError(f"Conversation pages start at index 1")
 
         if self.value.get(page) is None:
-            field_key = KB_RESOURCE_FIELD.format(
+            field_key = CONVERSATION_PAGE_VALUE.format(
                 kbid=self.kbid,
                 uuid=self.uuid,
                 type=self.type,
@@ -174,7 +184,7 @@ class Conversation(Field[PBConversation]):
         return self.value[page]
 
     async def db_set_value(self, payload: PBConversation, page: int = 0):
-        field_key = KB_RESOURCE_FIELD.format(
+        field_key = CONVERSATION_PAGE_VALUE.format(
             kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id, page=page
         )
         await self.resource.txn.set(
@@ -186,9 +196,7 @@ class Conversation(Field[PBConversation]):
 
     async def db_set_metadata(self, payload: FieldConversation):
         await self.resource.txn.set(
-            KB_RESOURCE_FIELD_METADATA.format(
-                kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
-            ),
+            CONVERSATION_METADATA.format(kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id),
             payload.SerializeToString(),
         )
         self.metadata = payload
--- a/nucliadb/reader/api/v1/download.py
+++ b/nucliadb/reader/api/v1/download.py
@@ -174,7 +174,9 @@ async def download_field_conversation_rslug_prefix(
     message_id: str,
     file_num: int,
 ) -> Response:
-    return await _download_field_conversation(request, kbid, field_id, message_id, file_num, rslug=rslug)
+    return await _download_field_conversation_attachment(
+        request, kbid, field_id, message_id, file_num, rslug=rslug
+    )
 
 
 @api.get(
@@ -185,7 +187,7 @@ async def download_field_conversation_rslug_prefix(
 )
 @requires_one([NucliaDBRoles.READER])
 @version(1)
-async def download_field_conversation_rid_prefix(
+async def download_field_conversation_attachment_rid_prefix(
     request: Request,
     kbid: str,
     rid: str,
@@ -193,10 +195,12 @@ async def download_field_conversation_rid_prefix(
     message_id: str,
     file_num: int,
 ) -> Response:
-    return await _download_field_conversation(request, kbid, field_id, message_id, file_num, rid=rid)
+    return await _download_field_conversation_attachment(
+        request, kbid, field_id, message_id, file_num, rid=rid
+    )
 
 
-async def _download_field_conversation(
+async def _download_field_conversation_attachment(
     request: Request,
     kbid: str,
     field_id: str,
@@ -209,7 +213,9 @@ async def _download_field_conversation(
 
     storage = await get_storage(service_name=SERVICE_NAME)
 
-    sf = storage.conversation_field(kbid, rid, field_id, message_id, file_num)
+    sf = storage.conversation_field_attachment(
+        kbid, rid, field_id, message_id, attachment_index=file_num
+    )
 
     return await download_api(sf, request.headers)
 
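Both the reader endpoint above and the ingest/writer call sites now go through `conversation_field_attachment` with an explicit `attachment_index` keyword instead of a positional `count`. A hypothetical stand-in showing why the keyword makes call sites unambiguous (the real method and key layout live in nucliadb_utils and may differ):

```python
def conversation_field_attachment(
    kbid: str, rid: str, field: str, ident: str, *, attachment_index: int
) -> str:
    # Returns the storage key for one attachment of one conversation message.
    # The key layout below is illustrative only.
    return f"kbs/{kbid}/r/{rid}/f/{field}/{ident}/{attachment_index}"


# The keyword-only argument forces the intent to be spelled out at the call site:
key = conversation_field_attachment("kb-1", "res-1", "chat", "msg-1", attachment_index=0)
assert key == "kbs/kb-1/r/res-1/f/chat/msg-1/0"
```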
--- a/nucliadb/writer/resource/field.py
+++ b/nucliadb/writer/resource/field.py
@@ -466,9 +466,9 @@ async def parse_conversation_field(
                ]
             )
 
-            for count, file in enumerate(message.content.attachments):
-                sf_conv_field: StorageField = storage.conversation_field(
-                    kbid, uuid, field=key, ident=message.ident, count=count
+            for idx, file in enumerate(message.content.attachments):
+                sf_conv_field: StorageField = storage.conversation_field_attachment(
+                    kbid, uuid, field=key, ident=message.ident, attachment_index=idx
                 )
                 cf_conv_field = await storage.upload_b64file_to_cloudfile(
                     sf_conv_field,
--- a/nucliadb-6.7.0.post4800.dist-info/METADATA
+++ b/nucliadb-6.7.0.post4805.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.7.0.post4800
+Version: 6.7.0.post4805
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.7.0.post4800
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.7.0.post4800
-Requires-Dist: nucliadb-protos>=6.7.0.post4800
-Requires-Dist: nucliadb-models>=6.7.0.post4800
-Requires-Dist: nidx-protos>=6.7.0.post4800
+Requires-Dist: nucliadb-telemetry[all]>=6.7.0.post4805
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.7.0.post4805
+Requires-Dist: nucliadb-protos>=6.7.0.post4805
+Requires-Dist: nucliadb-models>=6.7.0.post4805
+Requires-Dist: nidx-protos>=6.7.0.post4805
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.46.0
 Requires-Dist: uvicorn[standard]
--- a/nucliadb-6.7.0.post4800.dist-info/RECORD
+++ b/nucliadb-6.7.0.post4805.dist-info/RECORD
@@ -148,8 +148,8 @@ nucliadb/ingest/consumer/service.py,sha256=8AD41mMN7EUeUtk4ZNy14zfvxzwmVjIX6Mwe0
 nucliadb/ingest/consumer/shard_creator.py,sha256=w0smEu01FU_2cjZnsfBRNqT_Ntho11X17zTMST-vKbc,4359
 nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
 nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/ingest/fields/base.py,sha256=vYLGB-8SRYnFIHSZBSm20iXZDAzlwuBiJQC8s3BQv6w,22958
-nucliadb/ingest/fields/conversation.py,sha256=PGOr7l6cvg-l1HMr-5UIJq53R6W93F5uew0fOrSufPI,7140
+nucliadb/ingest/fields/base.py,sha256=O9rMKZmSdZmcxtuO6qUXjeoAkmUR5A-xga49Jyn6caI,24192
+nucliadb/ingest/fields/conversation.py,sha256=W68Y7riDXbwavpE4jKslLNAjdqCVaaR2a-BisZYayeo,7675
 nucliadb/ingest/fields/exceptions.py,sha256=sZBk21BSrXFdOdo1qUdCAyD-9YMYakSLdn4_WdIPCIQ,1217
 nucliadb/ingest/fields/file.py,sha256=1v4jLg3balUua2VmSV8hHkAwPFShTUCOzufZvIUQcQw,4740
 nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54Y4Ig,1547
@@ -199,7 +199,7 @@ nucliadb/reader/run.py,sha256=AR-iCnON3YVXgI5-KEgg99G4KAPN1BKXDg7nr4dgoDA,1447
 nucliadb/reader/api/__init__.py,sha256=c-UD29C0FVzQDGEvslebDCKtvnyEcAbiDd-3Q_QgGN4,872
 nucliadb/reader/api/models.py,sha256=UHhOPmh8xcHhDjFm8_-8t66yEggXdxRBoY0xV-hI9to,2076
 nucliadb/reader/api/v1/__init__.py,sha256=ieP8lsCCwG66Jupv8II5MSTj6nh3eCtLcF4utH9JOcU,1102
-nucliadb/reader/api/v1/download.py,sha256=rGv1c5CjrJheDgGwAnNWy76A_4V2shqqHYvwmKGFlpk,10758
+nucliadb/reader/api/v1/download.py,sha256=A4ntoyPbLQskORrVSGnnsMAmCZiQ-Mufi5j2u4lXeQQ,10872
 nucliadb/reader/api/v1/export_import.py,sha256=x4VBNDFjnlY1nIt5kdq0eZTB_DeRzGzT8T7uB7wUhNU,6448
 nucliadb/reader/api/v1/knowledgebox.py,sha256=Uu-yPB8KKZt1VaFrFNMMaXOvLsclBJDK9dzZ9lF2ctI,3645
 nucliadb/reader/api/v1/learning_config.py,sha256=t_KqQBBbhpo0m6nQTkYmNdZsLVmW53SLcHMrCWiQMrk,6536
@@ -365,7 +365,7 @@ nucliadb/writer/api/v1/vectorsets.py,sha256=F3iMViL5G95_Tns4aO2SOA0DwAzxK2_P8MXx
 nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
 nucliadb/writer/resource/basic.py,sha256=44GK8M9EEVoAUfGiabdLrrpENqeFwNn7qwxF2AHhQGg,10504
-nucliadb/writer/resource/field.py,sha256=me3Ig-11evXEJl3UNJsmZyHbs_VZYMa-WIKOHZPsO3E,21175
+nucliadb/writer/resource/field.py,sha256=1fHm2N-3LcgQDE3BUDiJ0ToCiwY1Di7UbA6fcJs8BIk,21193
 nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
 nucliadb/writer/tus/__init__.py,sha256=Kera0BtxoDX0ngPftXiMjNgjrhtQ3l2XFc5nJqSBOJY,5498
 nucliadb/writer/tus/azure.py,sha256=XhWAlWTM0vmXcXtuEPYjjeEhuZjiZXZu8q9WsJ7omFE,4107
@@ -376,8 +376,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vu1BGg4VqJ_x2P1u2BxqPKlSfw5orT_a3R-Ln5oPUpU,8483
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.7.0.post4800.dist-info/METADATA,sha256=p5YogXctzlcnUaCPjJowa0_MXNaVlxDYJdow5vfEqrs,4158
-nucliadb-6.7.0.post4800.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nucliadb-6.7.0.post4800.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
-nucliadb-6.7.0.post4800.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
-nucliadb-6.7.0.post4800.dist-info/RECORD,,
+nucliadb-6.7.0.post4805.dist-info/METADATA,sha256=uVCY9B5etvUvVS9QDwmPSGBJfJdGiMnOe0HRsHqh0JU,4158
+nucliadb-6.7.0.post4805.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nucliadb-6.7.0.post4805.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.7.0.post4805.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.7.0.post4805.dist-info/RECORD,,