nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional, Union
22
21
 
23
22
  import nucliadb_models as models
24
23
  from nucliadb.common import datamanagers
@@ -32,15 +31,22 @@ from nucliadb.ingest.fields.link import Link
32
31
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
33
32
  from nucliadb.ingest.orm.resource import Resource as ORMResource
34
33
  from nucliadb_models.common import FieldTypeName
34
+ from nucliadb_models.metadata import Extra, Origin, Relation
35
35
  from nucliadb_models.resource import (
36
36
  ConversationFieldData,
37
37
  ConversationFieldExtractedData,
38
38
  Error,
39
39
  ExtractedDataType,
40
40
  ExtractedDataTypeName,
41
+ ExtractedText,
42
+ FieldComputedMetadata,
43
+ FieldQuestionAnswers,
44
+ FileExtractedData,
41
45
  FileFieldData,
42
46
  FileFieldExtractedData,
43
47
  GenericFieldData,
48
+ LargeComputedMetadata,
49
+ LinkExtractedData,
44
50
  LinkFieldData,
45
51
  LinkFieldExtractedData,
46
52
  QueueType,
@@ -48,6 +54,7 @@ from nucliadb_models.resource import (
48
54
  ResourceData,
49
55
  TextFieldData,
50
56
  TextFieldExtractedData,
57
+ VectorObject,
51
58
  )
52
59
  from nucliadb_models.search import ResourceProperties
53
60
  from nucliadb_models.security import ResourceSecurity
@@ -56,84 +63,15 @@ from nucliadb_protos.writer_pb2 import FieldStatus
56
63
  from nucliadb_utils.utilities import get_storage
57
64
 
58
65
 
59
- async def set_resource_field_extracted_data(
60
- field: Field,
61
- field_data: ExtractedDataType,
62
- field_type_name: FieldTypeName,
63
- wanted_extracted_data: list[ExtractedDataTypeName],
64
- ) -> None:
65
- if field_data is None:
66
- return
67
-
68
- if ExtractedDataTypeName.TEXT in wanted_extracted_data:
69
- data_et = await field.get_extracted_text()
70
- if data_et is not None:
71
- field_data.text = from_proto.extracted_text(data_et)
72
-
73
- metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
74
- shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
75
- if metadata_wanted or shortened_metadata_wanted:
76
- data_fcm = await field.get_field_metadata()
77
-
78
- if data_fcm is not None:
79
- field_data.metadata = from_proto.field_computed_metadata(
80
- data_fcm, shortened=shortened_metadata_wanted and not metadata_wanted
81
- )
82
-
83
- if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
84
- data_lcm = await field.get_large_field_metadata()
85
- if data_lcm is not None:
86
- field_data.large_metadata = from_proto.large_computed_metadata(data_lcm)
87
-
88
- if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
89
- # XXX: our extracted API is not vectorset-compatible, so we'll get the
90
- # first vectorset and return the values. Ideally, we should provide a
91
- # way to select a vectorset
92
- vectorset_id = None
93
- async with datamanagers.with_ro_transaction() as txn:
94
- async for vectorset_id, vs in datamanagers.vectorsets.iter(
95
- txn=txn,
96
- kbid=field.resource.kb.kbid,
97
- ):
98
- break
99
- assert vectorset_id is not None, "All KBs must have at least a vectorset"
100
- data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
101
- if data_vec is not None:
102
- field_data.vectors = from_proto.vector_object(data_vec)
103
-
104
- if ExtractedDataTypeName.QA in wanted_extracted_data:
105
- qa = await field.get_question_answers()
106
- if qa is not None:
107
- field_data.question_answers = from_proto.field_question_answers(qa)
108
-
109
- if (
110
- isinstance(field, File)
111
- and isinstance(field_data, FileFieldExtractedData)
112
- and ExtractedDataTypeName.FILE in wanted_extracted_data
113
- ):
114
- data_fed = await field.get_file_extracted_data()
115
- if data_fed is not None:
116
- field_data.file = from_proto.file_extracted_data(data_fed)
117
-
118
- if (
119
- isinstance(field, Link)
120
- and isinstance(field_data, LinkFieldExtractedData)
121
- and ExtractedDataTypeName.LINK in wanted_extracted_data
122
- ):
123
- data_led = await field.get_link_extracted_data()
124
- if data_led is not None:
125
- field_data.link = from_proto.link_extracted_data(data_led)
126
-
127
-
128
66
  async def serialize(
129
67
  kbid: str,
130
- rid: Optional[str],
68
+ rid: str | None,
131
69
  show: list[ResourceProperties],
132
70
  field_type_filter: list[FieldTypeName],
133
71
  extracted: list[ExtractedDataTypeName],
134
- service_name: Optional[str] = None,
135
- slug: Optional[str] = None,
136
- ) -> Optional[Resource]:
72
+ service_name: str | None = None,
73
+ slug: str | None = None,
74
+ ) -> Resource | None:
137
75
  driver = get_driver()
138
76
  async with driver.ro_transaction() as txn:
139
77
  return await managed_serialize(
@@ -148,45 +86,56 @@ async def serialize(
148
86
  )
149
87
 
150
88
 
151
- async def serialize_field_errors(
152
- field: Field,
153
- serialized: Union[
154
- TextFieldData, FileFieldData, LinkFieldData, ConversationFieldData, GenericFieldData
155
- ],
156
- ):
157
- status = await field.get_status()
158
- if status is None:
159
- status = FieldStatus()
160
- serialized.status = status.Status.Name(status.status)
161
- if status.errors:
162
- serialized.errors = []
163
- for error in status.errors:
164
- serialized.errors.append(
165
- Error(
166
- body=error.source_error.error,
167
- code=error.source_error.code,
168
- code_str=writer_pb2.Error.ErrorCode.Name(error.source_error.code),
169
- created=error.created.ToDatetime(),
170
- severity=writer_pb2.Error.Severity.Name(error.source_error.severity),
171
- )
172
- )
173
- serialized.error = serialized.errors[-1]
174
-
175
-
176
89
  async def managed_serialize(
177
90
  txn: Transaction,
178
91
  kbid: str,
179
- rid: Optional[str],
92
+ rid: str | None,
180
93
  show: list[ResourceProperties],
181
94
  field_type_filter: list[FieldTypeName],
182
95
  extracted: list[ExtractedDataTypeName],
183
- service_name: Optional[str] = None,
184
- slug: Optional[str] = None,
185
- ) -> Optional[Resource]:
96
+ service_name: str | None = None,
97
+ slug: str | None = None,
98
+ ) -> Resource | None:
186
99
  orm_resource = await get_orm_resource(txn, kbid, rid=rid, slug=slug, service_name=service_name)
187
100
  if orm_resource is None:
188
101
  return None
189
102
 
103
+ return await serialize_resource(orm_resource, show, field_type_filter, extracted)
104
+
105
+
106
+ async def get_orm_resource(
107
+ txn: Transaction,
108
+ kbid: str,
109
+ rid: str | None,
110
+ slug: str | None = None,
111
+ service_name: str | None = None,
112
+ ) -> ORMResource | None:
113
+ storage = await get_storage(service_name=service_name)
114
+
115
+ kb = KnowledgeBox(txn, storage, kbid)
116
+
117
+ if rid is None:
118
+ if slug is None:
119
+ raise ValueError("Either rid or slug parameters should be used")
120
+
121
+ rid = await kb.get_resource_uuid_by_slug(slug)
122
+ if rid is None:
123
+ # Could not find resource uuid from slug
124
+ return None
125
+
126
+ orm_resource = await kb.get(rid)
127
+ if orm_resource is None:
128
+ return None
129
+
130
+ return orm_resource
131
+
132
+
133
+ async def serialize_resource(
134
+ orm_resource: ORMResource,
135
+ show: list[ResourceProperties],
136
+ field_type_filter: list[FieldTypeName],
137
+ extracted: list[ExtractedDataTypeName],
138
+ ) -> Resource:
190
139
  resource = Resource(id=orm_resource.uuid)
191
140
 
192
141
  include_values = ResourceProperties.VALUES in show
@@ -230,29 +179,18 @@ async def managed_serialize(
230
179
  resource.queue = QueueType[orm_resource.basic.QueueType.Name(orm_resource.basic.queue)]
231
180
 
232
181
  if ResourceProperties.RELATIONS in show:
233
- relations = await orm_resource.get_user_relations()
234
- resource.usermetadata.relations = [
235
- from_proto.relation(rel) for rel in relations.relations
236
- ]
182
+ resource.usermetadata.relations = await serialize_user_relations(orm_resource)
237
183
 
238
184
  if ResourceProperties.ORIGIN in show:
239
- await orm_resource.get_origin()
240
- if orm_resource.origin is not None:
241
- resource.origin = from_proto.origin(orm_resource.origin)
185
+ resource.origin = await serialize_origin(orm_resource)
242
186
 
243
187
  if ResourceProperties.EXTRA in show:
244
- await orm_resource.get_extra()
245
- if orm_resource.extra is not None:
246
- resource.extra = from_proto.extra(orm_resource.extra)
188
+ resource.extra = await serialize_extra(orm_resource)
247
189
 
248
190
  include_errors = ResourceProperties.ERRORS in show
249
191
 
250
192
  if ResourceProperties.SECURITY in show:
251
- await orm_resource.get_security()
252
- resource.security = ResourceSecurity(access_groups=[])
253
- if orm_resource.security is not None:
254
- for gid in orm_resource.security.access_groups:
255
- resource.security.access_groups.append(gid)
193
+ resource.security = await serialize_security(orm_resource)
256
194
 
257
195
  if (field_type_filter and (include_values or include_extracted_data)) or include_errors:
258
196
  await orm_resource.get_fields()
@@ -360,38 +298,157 @@ async def managed_serialize(
360
298
  return resource
361
299
 
362
300
 
363
- async def get_orm_resource(
364
- txn: Transaction,
365
- kbid: str,
366
- rid: Optional[str],
367
- slug: Optional[str] = None,
368
- service_name: Optional[str] = None,
369
- ) -> Optional[ORMResource]:
370
- storage = await get_storage(service_name=service_name)
301
+ async def serialize_origin(resource: ORMResource) -> Origin | None:
302
+ origin = await resource.get_origin()
303
+ if origin is None:
304
+ return None
371
305
 
372
- kb = KnowledgeBox(txn, storage, kbid)
306
+ return from_proto.origin(origin)
373
307
 
374
- if rid is None:
375
- if slug is None:
376
- raise ValueError("Either rid or slug parameters should be used")
377
308
 
378
- rid = await kb.get_resource_uuid_by_slug(slug)
379
- if rid is None:
380
- # Could not find resource uuid from slug
381
- return None
309
+ async def serialize_extra(resource: ORMResource) -> Extra | None:
310
+ extra = await resource.get_extra()
311
+ if extra is None:
312
+ return None
313
+ return from_proto.extra(extra)
382
314
 
383
- orm_resource = await kb.get(rid)
384
- if orm_resource is None:
315
+
316
+ async def serialize_user_relations(resource: ORMResource) -> list[Relation]:
317
+ relations = await resource.get_user_relations()
318
+ return [from_proto.relation(rel) for rel in relations.relations]
319
+
320
+
321
+ async def serialize_security(resource: ORMResource) -> ResourceSecurity:
322
+ security = ResourceSecurity(access_groups=[])
323
+
324
+ security_pb = await resource.get_security()
325
+ if security_pb is not None:
326
+ for gid in security_pb.access_groups:
327
+ security.access_groups.append(gid)
328
+
329
+ return security
330
+
331
+
332
+ async def serialize_field_errors(
333
+ field: Field,
334
+ serialized: (
335
+ TextFieldData | FileFieldData | LinkFieldData | ConversationFieldData | GenericFieldData
336
+ ),
337
+ ):
338
+ status = await field.get_status()
339
+ if status is None:
340
+ status = FieldStatus()
341
+ serialized.status = status.Status.Name(status.status)
342
+ if status.errors:
343
+ serialized.errors = []
344
+ for error in status.errors:
345
+ serialized.errors.append(
346
+ Error(
347
+ body=error.source_error.error,
348
+ code=error.source_error.code,
349
+ code_str=writer_pb2.Error.ErrorCode.Name(error.source_error.code),
350
+ created=error.created.ToDatetime(),
351
+ severity=writer_pb2.Error.Severity.Name(error.source_error.severity),
352
+ )
353
+ )
354
+ serialized.error = serialized.errors[-1]
355
+
356
+
357
+ async def set_resource_field_extracted_data(
358
+ field: Field,
359
+ field_data: ExtractedDataType,
360
+ field_type_name: FieldTypeName,
361
+ wanted_extracted_data: list[ExtractedDataTypeName],
362
+ ) -> None:
363
+ if field_data is None:
364
+ return
365
+
366
+ if ExtractedDataTypeName.TEXT in wanted_extracted_data:
367
+ field_data.text = await serialize_extracted_text(field)
368
+
369
+ metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
370
+ shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
371
+ if metadata_wanted or shortened_metadata_wanted:
372
+ field_data.metadata = await serialize_extracted_metadata(
373
+ field, shortened=shortened_metadata_wanted and not metadata_wanted
374
+ )
375
+
376
+ if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
377
+ field_data.large_metadata = await serialize_extracted_large_metadata(field)
378
+
379
+ if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
380
+ field_data.vectors = await serialize_extracted_vectors(field)
381
+
382
+ if ExtractedDataTypeName.QA in wanted_extracted_data:
383
+ field_data.question_answers = await serialize_extracted_question_answers(field)
384
+
385
+ if (
386
+ isinstance(field, File)
387
+ and isinstance(field_data, FileFieldExtractedData)
388
+ and ExtractedDataTypeName.FILE in wanted_extracted_data
389
+ ):
390
+ field_data.file = await serialize_file_extracted_data(field)
391
+
392
+ if (
393
+ isinstance(field, Link)
394
+ and isinstance(field_data, LinkFieldExtractedData)
395
+ and ExtractedDataTypeName.LINK in wanted_extracted_data
396
+ ):
397
+ field_data.link = await serialize_link_extracted_data(field)
398
+
399
+
400
+ async def serialize_extracted_text(field: Field) -> ExtractedText | None:
401
+ data_et = await field.get_extracted_text()
402
+ if data_et is None:
385
403
  return None
404
+ return from_proto.extracted_text(data_et)
386
405
 
387
- return orm_resource
388
406
 
407
+ async def serialize_extracted_metadata(field: Field, *, shortened: bool) -> FieldComputedMetadata | None:
408
+ data_fcm = await field.get_field_metadata()
409
+ if data_fcm is None:
410
+ return None
411
+ return from_proto.field_computed_metadata(data_fcm, shortened)
389
412
 
390
- async def get_resource_uuid_by_slug(
391
- kbid: str, slug: str, service_name: Optional[str] = None
392
- ) -> Optional[str]:
393
- storage = await get_storage(service_name=service_name)
394
- driver = get_driver()
395
- async with driver.ro_transaction() as txn:
396
- kb = KnowledgeBox(txn, storage, kbid)
397
- return await kb.get_resource_uuid_by_slug(slug)
413
+
414
+ async def serialize_extracted_large_metadata(field: Field) -> LargeComputedMetadata | None:
415
+ data_lcm = await field.get_large_field_metadata()
416
+ if data_lcm is None:
417
+ return None
418
+ return from_proto.large_computed_metadata(data_lcm)
419
+
420
+
421
+ async def serialize_extracted_vectors(field: Field) -> VectorObject | None:
422
+ # XXX: our extracted API is not vectorset-compatible, so we'll get the
423
+ # first vectorset and return the values. Ideally, we should provide a
424
+ # way to select a vectorset
425
+ vectorset_id = None
426
+ async with datamanagers.with_ro_transaction() as txn:
427
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(txn=txn, kbid=field.kbid):
428
+ break
429
+ assert vectorset_id is not None, "All KBs must have at least a vectorset"
430
+ data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
431
+ if data_vec is None:
432
+ return None
433
+ return from_proto.vector_object(data_vec)
434
+
435
+
436
+ async def serialize_extracted_question_answers(field: Field) -> FieldQuestionAnswers | None:
437
+ qa = await field.get_question_answers()
438
+ if qa is None:
439
+ return None
440
+ return from_proto.field_question_answers(qa)
441
+
442
+
443
+ async def serialize_file_extracted_data(field: File) -> FileExtractedData | None:
444
+ data_fed = await field.get_file_extracted_data()
445
+ if data_fed is None:
446
+ return None
447
+ return from_proto.file_extracted_data(data_fed)
448
+
449
+
450
+ async def serialize_link_extracted_data(field: Link) -> LinkExtractedData | None:
451
+ data_led = await field.get_link_extracted_data()
452
+ if data_led is None:
453
+ return None
454
+ return from_proto.link_extracted_data(data_led)
@@ -18,12 +18,11 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import uuid
21
- from typing import AsyncIterator
21
+ from collections.abc import AsyncIterator
22
22
 
23
23
  from nucliadb.backups import tasks as backup_tasks
24
24
  from nucliadb.backups import utils as backup_utils
25
25
  from nucliadb.common import datamanagers
26
- from nucliadb.common.cluster.exceptions import AlreadyExists, EntitiesGroupNotFound
27
26
  from nucliadb.common.cluster.utils import get_shard_manager
28
27
  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
29
28
  from nucliadb.common.external_index_providers.exceptions import ExternalIndexCreationError
@@ -49,7 +48,6 @@ from nucliadb_protos.knowledgebox_pb2 import (
49
48
  )
50
49
  from nucliadb_protos.writer_pb2 import (
51
50
  BrokerMessage,
52
- DelEntitiesRequest,
53
51
  GetEntitiesGroupRequest,
54
52
  GetEntitiesGroupResponse,
55
53
  GetEntitiesRequest,
@@ -58,12 +56,7 @@ from nucliadb_protos.writer_pb2 import (
58
56
  IndexStatus,
59
57
  ListEntitiesGroupsRequest,
60
58
  ListEntitiesGroupsResponse,
61
- NewEntitiesGroupRequest,
62
- NewEntitiesGroupResponse,
63
59
  OpStatusWriter,
64
- SetEntitiesRequest,
65
- UpdateEntitiesGroupRequest,
66
- UpdateEntitiesGroupResponse,
67
60
  WriterStatusRequest,
68
61
  WriterStatusResponse,
69
62
  )
@@ -126,6 +119,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
126
119
  external_index_provider=request.external_index_provider,
127
120
  hidden_resources_enabled=request.hidden_resources_enabled,
128
121
  hidden_resources_hide_on_creation=request.hidden_resources_hide_on_creation,
122
+ prewarm_enabled=request.prewarm_enabled,
129
123
  )
130
124
 
131
125
  except KnowledgeBoxConflict:
@@ -167,11 +161,17 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
167
161
  )
168
162
 
169
163
  try:
170
- async with self.driver.rw_transaction() as txn:
171
- kbid = await KnowledgeBoxORM.update(
172
- txn, uuid=request.uuid, slug=request.slug, config=request.config
173
- )
174
- await txn.commit()
164
+ kbid = await KnowledgeBoxORM.update(
165
+ self.driver,
166
+ kbid=request.uuid,
167
+ slug=request.slug,
168
+ title=request.config.title or None,
169
+ description=request.config.description or None,
170
+ external_index_provider=request.config.external_index_provider or None,
171
+ hidden_resources_enabled=request.config.hidden_resources_enabled,
172
+ hidden_resources_hide_on_creation=request.config.hidden_resources_hide_on_creation,
173
+ prewarm_enabled=request.config.prewarm_enabled,
174
+ )
175
175
  except KnowledgeBoxNotFound:
176
176
  return UpdateKnowledgeBoxResponse(status=KnowledgeBoxResponseStatus.NOTFOUND)
177
177
  except Exception:
@@ -217,29 +217,6 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
217
217
  logger.info(f"Processed {message.uuid}")
218
218
  return response
219
219
 
220
- async def NewEntitiesGroup( # type: ignore
221
- self, request: NewEntitiesGroupRequest, context=None
222
- ) -> NewEntitiesGroupResponse:
223
- response = NewEntitiesGroupResponse()
224
- async with self.driver.ro_transaction() as ro_txn:
225
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
226
- if kbobj is None:
227
- response.status = NewEntitiesGroupResponse.Status.KB_NOT_FOUND
228
- return response
229
-
230
- async with self.driver.rw_transaction() as txn:
231
- kbobj.txn = txn
232
- entities_manager = EntitiesManager(kbobj, txn)
233
- try:
234
- await entities_manager.create_entities_group(request.group, request.entities)
235
- except AlreadyExists:
236
- response.status = NewEntitiesGroupResponse.Status.ALREADY_EXISTS
237
- return response
238
-
239
- await txn.commit()
240
- response.status = NewEntitiesGroupResponse.Status.OK
241
- return response
242
-
243
220
  async def GetEntities( # type: ignore
244
221
  self, request: GetEntitiesRequest, context=None
245
222
  ) -> GetEntitiesResponse:
@@ -313,81 +290,6 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
313
290
 
314
291
  return response
315
292
 
316
- async def SetEntities(self, request: SetEntitiesRequest, context=None) -> OpStatusWriter: # type: ignore
317
- response = OpStatusWriter()
318
- async with self.driver.ro_transaction() as ro_txn:
319
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
320
- if kbobj is None:
321
- response.status = OpStatusWriter.Status.NOTFOUND
322
- return response
323
-
324
- async with self.driver.rw_transaction() as txn:
325
- kbobj.txn = txn
326
- entities_manager = EntitiesManager(kbobj, txn)
327
- try:
328
- await entities_manager.set_entities_group(request.group, request.entities)
329
- except Exception as e:
330
- errors.capture_exception(e)
331
- logger.error("Error in ingest gRPC servicer", exc_info=True)
332
- response.status = OpStatusWriter.Status.ERROR
333
- else:
334
- response.status = OpStatusWriter.Status.OK
335
- await txn.commit()
336
- return response
337
-
338
- async def UpdateEntitiesGroup( # type: ignore
339
- self, request: UpdateEntitiesGroupRequest, context=None
340
- ) -> UpdateEntitiesGroupResponse:
341
- response = UpdateEntitiesGroupResponse()
342
- async with self.driver.ro_transaction() as ro_txn:
343
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
344
- if kbobj is None:
345
- response.status = UpdateEntitiesGroupResponse.Status.KB_NOT_FOUND
346
- return response
347
-
348
- async with self.driver.rw_transaction() as txn:
349
- kbobj.txn = txn
350
- entities_manager = EntitiesManager(kbobj, txn)
351
- try:
352
- await entities_manager.set_entities_group_metadata(
353
- request.group,
354
- title=request.title,
355
- color=request.color,
356
- )
357
- updates = {**request.add, **request.update}
358
- await entities_manager.update_entities(request.group, updates)
359
- await entities_manager.delete_entities(request.group, request.delete) # type: ignore
360
- except EntitiesGroupNotFound:
361
- response.status = UpdateEntitiesGroupResponse.Status.ENTITIES_GROUP_NOT_FOUND
362
- return response
363
-
364
- await txn.commit()
365
- response.status = UpdateEntitiesGroupResponse.Status.OK
366
- return response
367
-
368
- async def DelEntities(self, request: DelEntitiesRequest, context=None) -> OpStatusWriter: # type: ignore
369
- response = OpStatusWriter()
370
-
371
- async with self.driver.ro_transaction() as ro_txn:
372
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
373
- if kbobj is None:
374
- response.status = OpStatusWriter.Status.NOTFOUND
375
- return response
376
-
377
- async with self.driver.rw_transaction() as txn:
378
- kbobj.txn = txn
379
- entities_manager = EntitiesManager(kbobj, txn)
380
- try:
381
- await entities_manager.delete_entities_group(request.group)
382
- except Exception as e:
383
- errors.capture_exception(e)
384
- logger.error("Error in ingest gRPC servicer", exc_info=True)
385
- response.status = OpStatusWriter.Status.ERROR
386
- else:
387
- await txn.commit()
388
- response.status = OpStatusWriter.Status.OK
389
- return response
390
-
391
293
  async def Status( # type: ignore
392
294
  self, request: WriterStatusRequest, context=None
393
295
  ) -> WriterStatusResponse:
@@ -406,8 +308,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
406
308
 
407
309
  async def Index(self, request: IndexResource, context=None) -> IndexStatus: # type: ignore
408
310
  async with self.driver.ro_transaction() as txn:
409
- kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
410
- resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
311
+ resobj = ResourceORM(txn, self.storage, request.kbid, request.rid)
411
312
  bm = await generate_broker_message(resobj)
412
313
  transaction = get_transaction_utility()
413
314
  partitioning = get_partitioning()
@@ -421,7 +322,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
421
322
  try:
422
323
  async with self.driver.rw_transaction() as txn:
423
324
  kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
424
- resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
325
+ resobj = ResourceORM(txn, self.storage, request.kbid, request.rid)
425
326
  resobj.disable_vectors = not request.reindex_vectors
426
327
  index_message = await get_resource_index_message(resobj, reindex=True)
427
328
  shard = await self.proc.get_or_assign_resource_shard(txn, kbobj, request.rid)