nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ import uuid
25
25
  from collections import defaultdict
26
26
  from contextlib import AsyncExitStack
27
27
  from enum import Enum
28
- from typing import Any, Optional
28
+ from typing import Any
29
29
 
30
30
  import aiohttp
31
31
  import backoff
@@ -132,19 +132,19 @@ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> Process
132
132
  class ProcessingEngine:
133
133
  def __init__(
134
134
  self,
135
- nuclia_service_account: Optional[str] = None,
136
- nuclia_zone: Optional[str] = None,
137
- nuclia_public_url: Optional[str] = None,
138
- nuclia_processing_cluster_url: Optional[str] = None,
139
- onprem: Optional[bool] = False,
140
- nuclia_jwt_key: Optional[str] = None,
135
+ nuclia_service_account: str | None = None,
136
+ nuclia_zone: str | None = None,
137
+ nuclia_public_url: str | None = None,
138
+ nuclia_processing_cluster_url: str | None = None,
139
+ onprem: bool | None = False,
140
+ nuclia_jwt_key: str | None = None,
141
141
  days_to_keep: int = 3,
142
142
  driver: FileBackendConfig = FileBackendConfig.GCS,
143
143
  ):
144
144
  self.nuclia_service_account = nuclia_service_account
145
145
  self.nuclia_zone = nuclia_zone
146
146
  if nuclia_public_url is not None:
147
- self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
147
+ self.nuclia_public_url: str | None = nuclia_public_url.format(zone=nuclia_zone)
148
148
  else:
149
149
  self.nuclia_public_url = None
150
150
 
@@ -196,7 +196,7 @@ class ProcessingEngine:
196
196
  return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
197
197
 
198
198
  def generate_file_token_from_fieldfile(
199
- self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
199
+ self, file: FieldFilePB, classif_labels: list[ClassificationLabel] | None = None
200
200
  ) -> str:
201
201
  if self.nuclia_jwt_key is None:
202
202
  raise AttributeError("Nuclia JWT key not set")
@@ -235,7 +235,7 @@ class ProcessingEngine:
235
235
  )
236
236
  @processing_observer.wrap({"type": "file_field_upload"})
237
237
  async def convert_filefield_to_str(
238
- self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
238
+ self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
239
239
  ) -> str:
240
240
  # Upload file without storing on Nuclia DB
241
241
  headers = {}
@@ -273,7 +273,7 @@ class ProcessingEngine:
273
273
  ).decode()
274
274
 
275
275
  def convert_external_filefield_to_str(
276
- self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
276
+ self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
277
277
  ) -> str:
278
278
  if self.nuclia_jwt_key is None:
279
279
  raise AttributeError("Nuclia JWT key not set")
@@ -313,7 +313,7 @@ class ProcessingEngine:
313
313
  self,
314
314
  file: FieldFilePB,
315
315
  storage: Storage,
316
- classif_labels: Optional[list[ClassificationLabel]] = None,
316
+ classif_labels: list[ClassificationLabel] | None = None,
317
317
  ) -> str:
318
318
  """It's already an internal file that needs to be uploaded"""
319
319
  if self.onprem is False:
@@ -438,7 +438,7 @@ class ProcessingEngine:
438
438
  queue=QueueType(queue_type) if queue_type is not None else None,
439
439
  )
440
440
 
441
- async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
441
+ async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
442
442
  """
443
443
  Delete a resource from processing. This prevents inflight resources from being processed
444
444
  and wasting resources.
@@ -479,7 +479,7 @@ class DummyProcessingEngine(ProcessingEngine):
479
479
  pass
480
480
 
481
481
  async def convert_filefield_to_str(
482
- self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
482
+ self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
483
483
  ) -> str:
484
484
  self.calls.append([file])
485
485
  index = len(self.values["convert_filefield_to_str"])
@@ -487,7 +487,7 @@ class DummyProcessingEngine(ProcessingEngine):
487
487
  return f"convert_filefield_to_str,{index}"
488
488
 
489
489
  def convert_external_filefield_to_str(
490
- self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
490
+ self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
491
491
  ) -> str:
492
492
  self.calls.append([file_field])
493
493
  index = len(self.values["convert_external_filefield_to_str"])
@@ -498,7 +498,7 @@ class DummyProcessingEngine(ProcessingEngine):
498
498
  self,
499
499
  file: FieldFilePB,
500
500
  storage: Storage,
501
- classif_labels: Optional[list[ClassificationLabel]] = None,
501
+ classif_labels: list[ClassificationLabel] | None = None,
502
502
  ) -> str:
503
503
  self.calls.append([file, storage])
504
504
  index = len(self.values["convert_internal_filefield_to_str"])
@@ -516,5 +516,5 @@ class DummyProcessingEngine(ProcessingEngine):
516
516
  self.values["send_to_process"].append([item, partition])
517
517
  return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
518
518
 
519
- async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
519
+ async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
520
520
  self.calls.append([kbid, resource_id])
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional, Union
22
21
 
23
22
  import nucliadb_models as models
24
23
  from nucliadb.common import datamanagers
@@ -32,15 +31,22 @@ from nucliadb.ingest.fields.link import Link
32
31
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
33
32
  from nucliadb.ingest.orm.resource import Resource as ORMResource
34
33
  from nucliadb_models.common import FieldTypeName
34
+ from nucliadb_models.metadata import Extra, Origin, Relation
35
35
  from nucliadb_models.resource import (
36
36
  ConversationFieldData,
37
37
  ConversationFieldExtractedData,
38
38
  Error,
39
39
  ExtractedDataType,
40
40
  ExtractedDataTypeName,
41
+ ExtractedText,
42
+ FieldComputedMetadata,
43
+ FieldQuestionAnswers,
44
+ FileExtractedData,
41
45
  FileFieldData,
42
46
  FileFieldExtractedData,
43
47
  GenericFieldData,
48
+ LargeComputedMetadata,
49
+ LinkExtractedData,
44
50
  LinkFieldData,
45
51
  LinkFieldExtractedData,
46
52
  QueueType,
@@ -48,6 +54,7 @@ from nucliadb_models.resource import (
48
54
  ResourceData,
49
55
  TextFieldData,
50
56
  TextFieldExtractedData,
57
+ VectorObject,
51
58
  )
52
59
  from nucliadb_models.search import ResourceProperties
53
60
  from nucliadb_models.security import ResourceSecurity
@@ -56,84 +63,15 @@ from nucliadb_protos.writer_pb2 import FieldStatus
56
63
  from nucliadb_utils.utilities import get_storage
57
64
 
58
65
 
59
- async def set_resource_field_extracted_data(
60
- field: Field,
61
- field_data: ExtractedDataType,
62
- field_type_name: FieldTypeName,
63
- wanted_extracted_data: list[ExtractedDataTypeName],
64
- ) -> None:
65
- if field_data is None:
66
- return
67
-
68
- if ExtractedDataTypeName.TEXT in wanted_extracted_data:
69
- data_et = await field.get_extracted_text()
70
- if data_et is not None:
71
- field_data.text = from_proto.extracted_text(data_et)
72
-
73
- metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
74
- shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
75
- if metadata_wanted or shortened_metadata_wanted:
76
- data_fcm = await field.get_field_metadata()
77
-
78
- if data_fcm is not None:
79
- field_data.metadata = from_proto.field_computed_metadata(
80
- data_fcm, shortened=shortened_metadata_wanted and not metadata_wanted
81
- )
82
-
83
- if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
84
- data_lcm = await field.get_large_field_metadata()
85
- if data_lcm is not None:
86
- field_data.large_metadata = from_proto.large_computed_metadata(data_lcm)
87
-
88
- if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
89
- # XXX: our extracted API is not vectorset-compatible, so we'll get the
90
- # first vectorset and return the values. Ideally, we should provide a
91
- # way to select a vectorset
92
- vectorset_id = None
93
- async with datamanagers.with_ro_transaction() as txn:
94
- async for vectorset_id, vs in datamanagers.vectorsets.iter(
95
- txn=txn,
96
- kbid=field.resource.kb.kbid,
97
- ):
98
- break
99
- assert vectorset_id is not None, "All KBs must have at least a vectorset"
100
- data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
101
- if data_vec is not None:
102
- field_data.vectors = from_proto.vector_object(data_vec)
103
-
104
- if ExtractedDataTypeName.QA in wanted_extracted_data:
105
- qa = await field.get_question_answers()
106
- if qa is not None:
107
- field_data.question_answers = from_proto.field_question_answers(qa)
108
-
109
- if (
110
- isinstance(field, File)
111
- and isinstance(field_data, FileFieldExtractedData)
112
- and ExtractedDataTypeName.FILE in wanted_extracted_data
113
- ):
114
- data_fed = await field.get_file_extracted_data()
115
- if data_fed is not None:
116
- field_data.file = from_proto.file_extracted_data(data_fed)
117
-
118
- if (
119
- isinstance(field, Link)
120
- and isinstance(field_data, LinkFieldExtractedData)
121
- and ExtractedDataTypeName.LINK in wanted_extracted_data
122
- ):
123
- data_led = await field.get_link_extracted_data()
124
- if data_led is not None:
125
- field_data.link = from_proto.link_extracted_data(data_led)
126
-
127
-
128
66
  async def serialize(
129
67
  kbid: str,
130
- rid: Optional[str],
68
+ rid: str | None,
131
69
  show: list[ResourceProperties],
132
70
  field_type_filter: list[FieldTypeName],
133
71
  extracted: list[ExtractedDataTypeName],
134
- service_name: Optional[str] = None,
135
- slug: Optional[str] = None,
136
- ) -> Optional[Resource]:
72
+ service_name: str | None = None,
73
+ slug: str | None = None,
74
+ ) -> Resource | None:
137
75
  driver = get_driver()
138
76
  async with driver.ro_transaction() as txn:
139
77
  return await managed_serialize(
@@ -148,45 +86,56 @@ async def serialize(
148
86
  )
149
87
 
150
88
 
151
- async def serialize_field_errors(
152
- field: Field,
153
- serialized: Union[
154
- TextFieldData, FileFieldData, LinkFieldData, ConversationFieldData, GenericFieldData
155
- ],
156
- ):
157
- status = await field.get_status()
158
- if status is None:
159
- status = FieldStatus()
160
- serialized.status = status.Status.Name(status.status)
161
- if status.errors:
162
- serialized.errors = []
163
- for error in status.errors:
164
- serialized.errors.append(
165
- Error(
166
- body=error.source_error.error,
167
- code=error.source_error.code,
168
- code_str=writer_pb2.Error.ErrorCode.Name(error.source_error.code),
169
- created=error.created.ToDatetime(),
170
- severity=writer_pb2.Error.Severity.Name(error.source_error.severity),
171
- )
172
- )
173
- serialized.error = serialized.errors[-1]
174
-
175
-
176
89
  async def managed_serialize(
177
90
  txn: Transaction,
178
91
  kbid: str,
179
- rid: Optional[str],
92
+ rid: str | None,
180
93
  show: list[ResourceProperties],
181
94
  field_type_filter: list[FieldTypeName],
182
95
  extracted: list[ExtractedDataTypeName],
183
- service_name: Optional[str] = None,
184
- slug: Optional[str] = None,
185
- ) -> Optional[Resource]:
96
+ service_name: str | None = None,
97
+ slug: str | None = None,
98
+ ) -> Resource | None:
186
99
  orm_resource = await get_orm_resource(txn, kbid, rid=rid, slug=slug, service_name=service_name)
187
100
  if orm_resource is None:
188
101
  return None
189
102
 
103
+ return await serialize_resource(orm_resource, show, field_type_filter, extracted)
104
+
105
+
106
+ async def get_orm_resource(
107
+ txn: Transaction,
108
+ kbid: str,
109
+ rid: str | None,
110
+ slug: str | None = None,
111
+ service_name: str | None = None,
112
+ ) -> ORMResource | None:
113
+ storage = await get_storage(service_name=service_name)
114
+
115
+ kb = KnowledgeBox(txn, storage, kbid)
116
+
117
+ if rid is None:
118
+ if slug is None:
119
+ raise ValueError("Either rid or slug parameters should be used")
120
+
121
+ rid = await kb.get_resource_uuid_by_slug(slug)
122
+ if rid is None:
123
+ # Could not find resource uuid from slug
124
+ return None
125
+
126
+ orm_resource = await kb.get(rid)
127
+ if orm_resource is None:
128
+ return None
129
+
130
+ return orm_resource
131
+
132
+
133
+ async def serialize_resource(
134
+ orm_resource: ORMResource,
135
+ show: list[ResourceProperties],
136
+ field_type_filter: list[FieldTypeName],
137
+ extracted: list[ExtractedDataTypeName],
138
+ ) -> Resource:
190
139
  resource = Resource(id=orm_resource.uuid)
191
140
 
192
141
  include_values = ResourceProperties.VALUES in show
@@ -230,29 +179,18 @@ async def managed_serialize(
230
179
  resource.queue = QueueType[orm_resource.basic.QueueType.Name(orm_resource.basic.queue)]
231
180
 
232
181
  if ResourceProperties.RELATIONS in show:
233
- relations = await orm_resource.get_user_relations()
234
- resource.usermetadata.relations = [
235
- from_proto.relation(rel) for rel in relations.relations
236
- ]
182
+ resource.usermetadata.relations = await serialize_user_relations(orm_resource)
237
183
 
238
184
  if ResourceProperties.ORIGIN in show:
239
- await orm_resource.get_origin()
240
- if orm_resource.origin is not None:
241
- resource.origin = from_proto.origin(orm_resource.origin)
185
+ resource.origin = await serialize_origin(orm_resource)
242
186
 
243
187
  if ResourceProperties.EXTRA in show:
244
- await orm_resource.get_extra()
245
- if orm_resource.extra is not None:
246
- resource.extra = from_proto.extra(orm_resource.extra)
188
+ resource.extra = await serialize_extra(orm_resource)
247
189
 
248
190
  include_errors = ResourceProperties.ERRORS in show
249
191
 
250
192
  if ResourceProperties.SECURITY in show:
251
- await orm_resource.get_security()
252
- resource.security = ResourceSecurity(access_groups=[])
253
- if orm_resource.security is not None:
254
- for gid in orm_resource.security.access_groups:
255
- resource.security.access_groups.append(gid)
193
+ resource.security = await serialize_security(orm_resource)
256
194
 
257
195
  if (field_type_filter and (include_values or include_extracted_data)) or include_errors:
258
196
  await orm_resource.get_fields()
@@ -360,38 +298,157 @@ async def managed_serialize(
360
298
  return resource
361
299
 
362
300
 
363
- async def get_orm_resource(
364
- txn: Transaction,
365
- kbid: str,
366
- rid: Optional[str],
367
- slug: Optional[str] = None,
368
- service_name: Optional[str] = None,
369
- ) -> Optional[ORMResource]:
370
- storage = await get_storage(service_name=service_name)
301
+ async def serialize_origin(resource: ORMResource) -> Origin | None:
302
+ origin = await resource.get_origin()
303
+ if origin is None:
304
+ return None
371
305
 
372
- kb = KnowledgeBox(txn, storage, kbid)
306
+ return from_proto.origin(origin)
373
307
 
374
- if rid is None:
375
- if slug is None:
376
- raise ValueError("Either rid or slug parameters should be used")
377
308
 
378
- rid = await kb.get_resource_uuid_by_slug(slug)
379
- if rid is None:
380
- # Could not find resource uuid from slug
381
- return None
309
+ async def serialize_extra(resource: ORMResource) -> Extra | None:
310
+ extra = await resource.get_extra()
311
+ if extra is None:
312
+ return None
313
+ return from_proto.extra(extra)
382
314
 
383
- orm_resource = await kb.get(rid)
384
- if orm_resource is None:
315
+
316
+ async def serialize_user_relations(resource: ORMResource) -> list[Relation]:
317
+ relations = await resource.get_user_relations()
318
+ return [from_proto.relation(rel) for rel in relations.relations]
319
+
320
+
321
+ async def serialize_security(resource: ORMResource) -> ResourceSecurity:
322
+ security = ResourceSecurity(access_groups=[])
323
+
324
+ security_pb = await resource.get_security()
325
+ if security_pb is not None:
326
+ for gid in security_pb.access_groups:
327
+ security.access_groups.append(gid)
328
+
329
+ return security
330
+
331
+
332
+ async def serialize_field_errors(
333
+ field: Field,
334
+ serialized: (
335
+ TextFieldData | FileFieldData | LinkFieldData | ConversationFieldData | GenericFieldData
336
+ ),
337
+ ):
338
+ status = await field.get_status()
339
+ if status is None:
340
+ status = FieldStatus()
341
+ serialized.status = status.Status.Name(status.status)
342
+ if status.errors:
343
+ serialized.errors = []
344
+ for error in status.errors:
345
+ serialized.errors.append(
346
+ Error(
347
+ body=error.source_error.error,
348
+ code=error.source_error.code,
349
+ code_str=writer_pb2.Error.ErrorCode.Name(error.source_error.code),
350
+ created=error.created.ToDatetime(),
351
+ severity=writer_pb2.Error.Severity.Name(error.source_error.severity),
352
+ )
353
+ )
354
+ serialized.error = serialized.errors[-1]
355
+
356
+
357
+ async def set_resource_field_extracted_data(
358
+ field: Field,
359
+ field_data: ExtractedDataType,
360
+ field_type_name: FieldTypeName,
361
+ wanted_extracted_data: list[ExtractedDataTypeName],
362
+ ) -> None:
363
+ if field_data is None:
364
+ return
365
+
366
+ if ExtractedDataTypeName.TEXT in wanted_extracted_data:
367
+ field_data.text = await serialize_extracted_text(field)
368
+
369
+ metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
370
+ shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
371
+ if metadata_wanted or shortened_metadata_wanted:
372
+ field_data.metadata = await serialize_extracted_metadata(
373
+ field, shortened=shortened_metadata_wanted and not metadata_wanted
374
+ )
375
+
376
+ if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
377
+ field_data.large_metadata = await serialize_extracted_large_metadata(field)
378
+
379
+ if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
380
+ field_data.vectors = await serialize_extracted_vectors(field)
381
+
382
+ if ExtractedDataTypeName.QA in wanted_extracted_data:
383
+ field_data.question_answers = await serialize_extracted_question_answers(field)
384
+
385
+ if (
386
+ isinstance(field, File)
387
+ and isinstance(field_data, FileFieldExtractedData)
388
+ and ExtractedDataTypeName.FILE in wanted_extracted_data
389
+ ):
390
+ field_data.file = await serialize_file_extracted_data(field)
391
+
392
+ if (
393
+ isinstance(field, Link)
394
+ and isinstance(field_data, LinkFieldExtractedData)
395
+ and ExtractedDataTypeName.LINK in wanted_extracted_data
396
+ ):
397
+ field_data.link = await serialize_link_extracted_data(field)
398
+
399
+
400
+ async def serialize_extracted_text(field: Field) -> ExtractedText | None:
401
+ data_et = await field.get_extracted_text()
402
+ if data_et is None:
385
403
  return None
404
+ return from_proto.extracted_text(data_et)
386
405
 
387
- return orm_resource
388
406
 
407
+ async def serialize_extracted_metadata(field: Field, *, shortened: bool) -> FieldComputedMetadata | None:
408
+ data_fcm = await field.get_field_metadata()
409
+ if data_fcm is None:
410
+ return None
411
+ return from_proto.field_computed_metadata(data_fcm, shortened)
389
412
 
390
- async def get_resource_uuid_by_slug(
391
- kbid: str, slug: str, service_name: Optional[str] = None
392
- ) -> Optional[str]:
393
- storage = await get_storage(service_name=service_name)
394
- driver = get_driver()
395
- async with driver.ro_transaction() as txn:
396
- kb = KnowledgeBox(txn, storage, kbid)
397
- return await kb.get_resource_uuid_by_slug(slug)
413
+
414
+ async def serialize_extracted_large_metadata(field: Field) -> LargeComputedMetadata | None:
415
+ data_lcm = await field.get_large_field_metadata()
416
+ if data_lcm is None:
417
+ return None
418
+ return from_proto.large_computed_metadata(data_lcm)
419
+
420
+
421
+ async def serialize_extracted_vectors(field: Field) -> VectorObject | None:
422
+ # XXX: our extracted API is not vectorset-compatible, so we'll get the
423
+ # first vectorset and return the values. Ideally, we should provide a
424
+ # way to select a vectorset
425
+ vectorset_id = None
426
+ async with datamanagers.with_ro_transaction() as txn:
427
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(txn=txn, kbid=field.kbid):
428
+ break
429
+ assert vectorset_id is not None, "All KBs must have at least a vectorset"
430
+ data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
431
+ if data_vec is None:
432
+ return None
433
+ return from_proto.vector_object(data_vec)
434
+
435
+
436
+ async def serialize_extracted_question_answers(field: Field) -> FieldQuestionAnswers | None:
437
+ qa = await field.get_question_answers()
438
+ if qa is None:
439
+ return None
440
+ return from_proto.field_question_answers(qa)
441
+
442
+
443
+ async def serialize_file_extracted_data(field: File) -> FileExtractedData | None:
444
+ data_fed = await field.get_file_extracted_data()
445
+ if data_fed is None:
446
+ return None
447
+ return from_proto.file_extracted_data(data_fed)
448
+
449
+
450
+ async def serialize_link_extracted_data(field: Link) -> LinkExtractedData | None:
451
+ data_led = await field.get_link_extracted_data()
452
+ if data_led is None:
453
+ return None
454
+ return from_proto.link_extracted_data(data_led)