nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/common/ids.py CHANGED
@@ -24,7 +24,6 @@ paragraphs... Avoiding spread of id construction and parsing everywhere
 """
 
 from dataclasses import dataclass
-from typing import Optional
 
 from nucliadb_models.common import FieldTypeName
 from nucliadb_protos.resources_pb2 import FieldType
@@ -47,6 +46,8 @@ FIELD_TYPE_NAME_TO_STR = {
     FieldTypeName.CONVERSATION: "c",
 }
 
+FIELD_TYPE_STR_TO_NAME = {v: k for k, v in FIELD_TYPE_NAME_TO_STR.items()}
+
 
 @dataclass
 class FieldId:
@@ -65,7 +66,7 @@ class FieldId:
 
     Examples:
 
-    >>> FieldId(rid="rid", type="u", key="/my-link")
+    >>> FieldId(rid="rid", type="u", key="my-link")
     FieldID("rid/u/my-link")
     >>> FieldId.from_string("rid/u/my-link")
     FieldID("rid/u/my-link")
@@ -75,32 +76,7 @@ class FieldId:
     type: str
     key: str
     # also knwon as `split`, this indicates a part of a field in, for example, conversations
-    subfield_id: Optional[str] = None
-
-    def __repr__(self) -> str:
-        return f"FieldId({self.full()})"
-
-    def short_without_subfield(self) -> str:
-        return f"/{self.type}/{self.key}"
-
-    def full(self) -> str:
-        if self.subfield_id is None:
-            return f"{self.rid}/{self.type}/{self.key}"
-        else:
-            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def pb_type(self) -> FieldType.ValueType:
-        return FIELD_TYPE_STR_TO_PB[self.type]
-
-    @classmethod
-    def from_pb(
-        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
-    ) -> "FieldId":
-        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+    subfield_id: str | None = None
 
     @classmethod
     def from_string(cls, value: str) -> "FieldId":
@@ -120,11 +96,11 @@ class FieldId:
         parts = value.split("/")
         if len(parts) == 3:
             rid, _type, key = parts
-            _type = cls.parse_field_type(_type)
+            _type = cls._parse_field_type(_type)
             return cls(rid=rid, type=_type, key=key)
         elif len(parts) == 4:
             rid, _type, key, subfield_id = parts
-            _type = cls.parse_field_type(_type)
+            _type = cls._parse_field_type(_type)
             return cls(
                 rid=rid,
                 type=_type,
@@ -135,7 +111,49 @@ class FieldId:
         raise ValueError(f"Invalid FieldId: {value}")
 
     @classmethod
-    def parse_field_type(cls, _type: str) -> str:
+    def from_pb(
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: str | None = None
+    ) -> "FieldId":
+        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+
+    @property
+    def pb_type(self) -> FieldType.ValueType:
+        return FIELD_TYPE_STR_TO_PB[self.type]
+
+    def full(self) -> str:
+        if self.subfield_id is None:
+            return f"{self.rid}/{self.type}/{self.key}"
+        else:
+            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+
+    def full_without_subfield(self) -> str:
+        return f"{self.rid}/{self.type}/{self.key}"
+
+    def short_without_subfield(self) -> str:
+        return f"/{self.type}/{self.key}"
+
+    def paragraph_id(self, paragraph_start: int, paragraph_end: int) -> "ParagraphId":
+        """Generate a ParagraphId from the current field given its start and
+        end.
+
+        """
+        return ParagraphId(
+            field_id=self,
+            paragraph_start=paragraph_start,
+            paragraph_end=paragraph_end,
+        )
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"FieldId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
+    @staticmethod
+    def _parse_field_type(_type: str) -> str:
         if _type not in FIELD_TYPE_STR_TO_PB:
             # Try to parse the enum value
             # XXX: This is to support field types that are integer values of FieldType
@@ -157,19 +175,6 @@ class ParagraphId:
     paragraph_start: int
     paragraph_end: int
 
-    def __repr__(self) -> str:
-        return f"ParagraphId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "ParagraphId":
         parts = value.split("/")
@@ -192,6 +197,22 @@ class ParagraphId:
             paragraph_end=vid.vector_end,
         )
 
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"ParagraphId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
 
 @dataclass
 class VectorId:
@@ -217,19 +238,6 @@ class VectorId:
     vector_start: int
     vector_end: int
 
-    def __repr__(self) -> str:
-        return f"VectorId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
    @classmethod
    def from_string(cls, value: str) -> "VectorId":
        parts = value.split("/")
@@ -239,8 +247,24 @@
         field_id = FieldId.from_string("/".join(parts[:-2]))
         return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)
 
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"VectorId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
 
-def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
+def extract_data_augmentation_id(generated_field_id: str) -> str | None:
     """Data augmentation generated fields have a strict id with the following
     format:
     `da-{task_id}-{original:field_type}-{original:field_id}[-{original:split}]`
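
Note: a minimal usage sketch of the reorganized id helpers above, using only methods visible in this diff; the values "rid", "u" and "my-link" are placeholders taken from the docstring examples.

    from nucliadb.common.ids import FieldId

    field = FieldId.from_string("rid/u/my-link")
    assert str(field) == "rid/u/my-link"  # the new __str__ delegates to full()

    # paragraph_id() is new in this version: derive a ParagraphId from a field
    paragraph = field.paragraph_id(paragraph_start=0, paragraph_end=10)
    assert str(paragraph) == "rid/u/my-link/0-10"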
nucliadb/common/locking.py CHANGED
@@ -22,7 +22,6 @@ import logging
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Optional
 
 import orjson
 
@@ -99,7 +98,7 @@ class _Lock:
         self.task = asyncio.create_task(self._refresh_task())
         return self
 
-    async def get_lock_data(self, txn: Transaction) -> Optional[LockValue]:
+    async def get_lock_data(self, txn: Transaction) -> LockValue | None:
         existing_data = await txn.get(self.key, for_update=True)
         if existing_data is None:
             return None
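
Note: the locking.py hunk above is the smallest instance of a pattern applied throughout this release: PEP 604 union syntax (X | None, A | B) replaces typing.Optional and typing.Union, often removing the typing import altogether. A before/after sketch with illustrative names:

    from typing import Optional

    def get_old(key: str) -> Optional[bytes]:  # 6.7.x spelling
        ...

    def get_new(key: str) -> bytes | None:  # 6.10.x spelling, native syntax since Python 3.10
        ...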
nucliadb/common/maindb/driver.py CHANGED
@@ -20,8 +20,9 @@
 from __future__ import annotations
 
 import asyncio
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
+from typing import ClassVar
 
 DEFAULT_SCAN_LIMIT = -1
 DEFAULT_BATCH_SCAN_LIMIT = 500
@@ -37,10 +38,10 @@ class Transaction:
     async def commit(self):
         raise NotImplementedError()
 
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
         raise NotImplementedError()
 
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         raise NotImplementedError()
 
     async def set(self, key: str, value: bytes):
@@ -57,7 +58,7 @@ class Transaction:
 
     def keys(
         self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         raise NotImplementedError()
 
     async def count(self, match: str) -> int:
@@ -66,7 +67,7 @@
 
 class Driver:
     initialized = False
-    _abort_tasks: list[asyncio.Task] = []
+    _abort_tasks: ClassVar[list[asyncio.Task]] = []
 
     async def initialize(self):
         raise NotImplementedError()
@@ -81,15 +82,15 @@ class Driver:
         pass
 
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         yield Transaction()
 
     @asynccontextmanager
-    async def ro_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def ro_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=True) as txn:
             yield txn
 
     @asynccontextmanager
-    async def rw_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def rw_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=False) as txn:
             yield txn
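
Note: two other recurring cleanups land in driver.py above. AsyncGenerator now comes from collections.abc and is written with a single type parameter (its SendType defaults to None since Python 3.13, per PEP 696), and the mutable class attribute is annotated with ClassVar. A sketch combining both, with illustrative names:

    import asyncio
    from collections.abc import AsyncGenerator
    from typing import ClassVar

    class ExampleDriver:
        # ClassVar marks the mutable default as deliberately class-level
        # (shared by all instances) rather than a per-instance field.
        _abort_tasks: ClassVar[list[asyncio.Task]] = []

    async def keys(match: str) -> AsyncGenerator[str]:  # was AsyncGenerator[str, None]
        for key in ("a", "ab"):
            if key.startswith(match):
                yield key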
nucliadb/common/maindb/local.py CHANGED
@@ -19,8 +19,8 @@
 #
 import glob
 import os
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
 
 from nucliadb.common.maindb.driver import (
     DEFAULT_BATCH_SCAN_LIMIT,
@@ -78,7 +78,7 @@ class LocalTransaction(Transaction):
         # Deleting a key that does not exist
         pass
 
-    async def read(self, key: str) -> Optional[bytes]:
+    async def read(self, key: str) -> bytes | None:
         try:
             async with aiofiles.open(self.compute_path(key), "rb") as resp:
                 return await resp.read()
@@ -106,8 +106,8 @@ class LocalTransaction(Transaction):
         self.clean()
         self.open = False
 
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
-        results: list[Optional[bytes]] = []
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
+        results: list[bytes | None] = []
         for key in keys:
             obj = await self.get(key)
             if obj:
@@ -125,7 +125,7 @@ class LocalTransaction(Transaction):
 
         return results
 
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         if key in self.deleted_keys:
             raise KeyError(f"Not found {key}")
 
nucliadb/common/maindb/pg.py CHANGED
@@ -21,8 +21,9 @@ from __future__ import annotations
 
 import asyncio
 import logging
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import Any, AsyncGenerator, Optional
+from typing import Any
 
 import backoff
 import psycopg
@@ -72,7 +73,7 @@ class DataLayer:
         self.connection = connection
         self.log_on_select_for_update = settings.driver_pg_log_on_select_for_update
 
-    async def get(self, key: str, select_for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, select_for_update: bool = False) -> bytes | None:
         with pg_observer({"type": "get"}):
             statement = "SELECT value FROM resources WHERE key = %s"
             if select_for_update:
@@ -116,7 +117,7 @@ class DataLayer:
             async with self.connection.cursor() as cur:
                 await cur.execute("DELETE FROM resources WHERE key LIKE %s", (prefix + "%",))
 
-    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[bytes | None]:
         with pg_observer({"type": "batch_get"}):
             async with self.connection.cursor() as cur:
                 statement = "SELECT key, value FROM resources WHERE key = ANY(%s)"
@@ -134,7 +135,7 @@ class DataLayer:
         prefix: str,
         limit: int = DEFAULT_SCAN_LIMIT,
         include_start: bool = True,
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         query = "SELECT key FROM resources WHERE key LIKE %s ORDER BY key"
 
         args: list[Any] = [prefix + "%"]
@@ -190,7 +191,7 @@ class PGTransaction(Transaction):
     async def batch_get(self, keys: list[str], for_update: bool = True):
         return await self.data_layer.batch_get(keys, select_for_update=for_update)
 
-    async def get(self, key: str, for_update: bool = True) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = True) -> bytes | None:
         return await self.data_layer.get(key, select_for_update=for_update)
 
     async def set(self, key: str, value: bytes):
@@ -243,7 +244,7 @@ class ReadOnlyPGTransaction(Transaction):
         return await DataLayer(conn).batch_get(keys, select_for_update=False)
 
     @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         async with self.driver._get_connection() as conn:
             return await DataLayer(conn).get(key, select_for_update=False)
 
@@ -330,7 +331,7 @@ class PGDriver(Driver):
             metric.set(value)
 
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         if read_only:
             yield ReadOnlyPGTransaction(self)
         else:
@@ -343,7 +344,7 @@ class PGDriver(Driver):
             await txn.abort()
 
     @asynccontextmanager
-    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection, None]:
+    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection]:
         timeout = self.acquire_timeout_ms / 1000
         # Manual retry loop since backoff.on_exception does not play well with async context managers
         retries = 0
nucliadb/common/nidx.py CHANGED
@@ -19,7 +19,6 @@
 #
 
 import os
-from typing import Optional, Union
 
 from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
 from nidx_protos.nodewriter_pb2 import (
@@ -54,7 +53,7 @@ class NidxUtility:
         pass
 
 
-def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
+def _storage_config(prefix: str, bucket: str | None) -> dict[str, str]:
     config = {}
     if storage_settings.file_backend == FileBackendConfig.LOCAL:
         local_bucket = bucket or storage_settings.local_indexing_bucket
@@ -82,6 +81,24 @@ def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
             config[f"{prefix}__REGION_NAME"] = storage_settings.s3_region_name or ""
         if storage_settings.s3_endpoint:
             config[f"{prefix}__ENDPOINT"] = storage_settings.s3_endpoint
+    elif storage_settings.file_backend == FileBackendConfig.AZURE:
+        if storage_settings.azure_account_url is None:
+            raise ValueError("Azure account is required")
+        config[f"{prefix}__OBJECT_STORE"] = "azure"
+        url = storage_settings.azure_account_url
+        container = bucket or extended_storage_settings.azure_indexing_bucket
+        if container:
+            url += f"/{container}"
+        config[f"{prefix}__CONTAINER_URL"] = url
+        if storage_settings.azure_connection_string:
+            params = {
+                p.split("=", 1)[0]: p.split("=", 1)[1]
+                for p in storage_settings.azure_connection_string.split(";")
+            }
+            if "AccountKey" in params:
+                config[f"{prefix}__ACCOUNT_KEY"] = params["AccountKey"]
+            if "BlobEndpoint" in params:
+                config[f"{prefix}__ENDPOINT"] = params["BlobEndpoint"]
 
     return config
 
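Note: the Azure branch added above assumes the standard connection-string format, a semicolon-separated list of key=value pairs; splitting with maxsplit=1 keeps the "=" padding of base64 account keys intact. An illustration of the same parsing with placeholder credentials:

    # Placeholder values; real connection strings carry account credentials.
    conn = "AccountName=devaccount;AccountKey=c2VjcmV0PQ==;BlobEndpoint=http://localhost:10000/devaccount"
    params = {p.split("=", 1)[0]: p.split("=", 1)[1] for p in conn.split(";")}
    assert params["AccountKey"] == "c2VjcmV0PQ=="  # '==' padding preserved
    assert params["BlobEndpoint"] == "http://localhost:10000/devaccount"
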
@@ -143,7 +160,7 @@ class NidxNatsIndexer:
     async def index(self, writer: IndexMessage) -> int:
         res = await self.nats_connection_manager.js.publish(self.subject, writer.SerializeToString())
         logger.info(
-            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"  # noqa
+            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"
         )
         return res.seq
 
@@ -167,7 +184,7 @@ class NidxGrpcIndexer:
 class NidxServiceUtility(NidxUtility):
     """Implements Nidx utility connecting to the network service"""
 
-    indexer: Union[NidxNatsIndexer, NidxGrpcIndexer]
+    indexer: NidxNatsIndexer | NidxGrpcIndexer
 
     def __init__(self, service_name: str):
         self.service_name = service_name
@@ -198,7 +215,7 @@ class NidxServiceUtility(NidxUtility):
         return await self.indexer.index(writer)
 
 
-async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> Optional[NidxUtility]:
+async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> NidxUtility:
     nidx = get_utility(Utility.NIDX)
     if nidx:
         return nidx
nucliadb/common/vector_index_config.py CHANGED
@@ -26,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2 as Nucliadb
 def nucliadb_vector_type_to_nidx(nucliadb: Nucliadb.VectorType.ValueType) -> Nidx.VectorType.ValueType:
     if nucliadb == Nucliadb.DENSE_F32:
         return Nidx.DENSE_F32
-    else:  # pragma: nocover
+    else:  # pragma: no cover
         raise Exception("Unknown vector type")
 
 
nucliadb/export_import/datamanager.py CHANGED
@@ -18,8 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import json
+from collections.abc import AsyncGenerator
 from datetime import datetime, timezone
-from typing import AsyncGenerator, Type, Union, cast
+from typing import Type, cast
 
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.export_import import logger
@@ -34,7 +35,7 @@ MAINDB_IMPORT_KEY = "/kbs/{kbid}/imports/{id}"
 STORAGE_EXPORT_KEY = "exports/{export_id}"
 STORAGE_IMPORT_KEY = "imports/{import_id}"
 
-Metadata = Union[ExportMetadata, ImportMetadata]
+Metadata = ExportMetadata | ImportMetadata
 
 
 class ExportImportDataManager:
@@ -59,7 +60,7 @@ class ExportImportDataManager:
         if data is None or data == b"":
             raise MetadataNotFound()
         decoded = data.decode("utf-8")
-        model_type: Union[Type[ExportMetadata], Type[ImportMetadata]]
+        model_type: Type[ExportMetadata] | Type[ImportMetadata]
         if type == "export":
             model_type = ExportMetadata
         elif type == "import":
nucliadb/export_import/exporter.py CHANGED
@@ -18,11 +18,12 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from typing import AsyncGenerator, Optional
+from collections.abc import AsyncGenerator
 
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ExportMetadata,
@@ -33,7 +34,6 @@ from nucliadb.export_import.utils import (
     download_binary,
     get_broker_message,
     get_cloud_files,
-    get_entities,
     get_labels,
     get_learning_config,
     iter_kb_resource_uuids,
@@ -43,7 +43,7 @@ from nucliadb_telemetry import errors
 
 
 async def export_kb(
-    context: ApplicationContext, kbid: str, metadata: Optional[ExportMetadata] = None
+    context: ApplicationContext, kbid: str, metadata: ExportMetadata | None = None
 ) -> AsyncGenerator[bytes, None]:
     """Export the data of a knowledgebox to a stream of bytes.
 
@@ -63,9 +63,6 @@ async def export_kb(
     async for chunk in resources_iterator:
         yield chunk
 
-    async for chunk in export_entities(context, kbid):
-        yield chunk
-
     async for chunk in export_labels(context, kbid):
         yield chunk
 
@@ -76,7 +73,14 @@ async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMessage
     """
     kbid, export_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    try:
+        metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Export metadata not found. Skipping export.", extra={"kbid": kbid, "export_id": export_id}
+        )
+        return
+
     iterator = export_kb(context, kbid, metadata)
 
     retry_handler = TaskRetryHandler("export", dm, metadata)
@@ -167,18 +171,6 @@ async def export_resource_with_binaries(
     yield bm_bytes
 
 
-async def export_entities(
-    context: ApplicationContext,
-    kbid: str,
-) -> AsyncGenerator[bytes, None]:
-    entities = await get_entities(context, kbid)
-    if len(entities.entities_groups) > 0:
-        data = entities.SerializeToString()
-        yield ExportedItemType.ENTITIES.encode("utf-8")
-        yield len(data).to_bytes(4, byteorder="big")
-        yield data
-
-
 async def export_labels(
     context: ApplicationContext,
     kbid: str,
nucliadb/export_import/importer.py CHANGED
@@ -17,11 +17,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import AsyncGenerator, Callable, Optional, cast
+from collections.abc import AsyncGenerator, Callable
+from typing import cast
 
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ImportMetadata,
@@ -32,7 +34,6 @@ from nucliadb.export_import.utils import (
     TaskRetryHandler,
     import_binary,
     restore_broker_message,
-    set_entities_groups,
     set_labels,
 )
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -46,7 +47,7 @@ async def import_kb(
     context: ApplicationContext,
     kbid: str,
     stream: AsyncGenerator[bytes, None],
-    metadata: Optional[ImportMetadata] = None,
+    metadata: ImportMetadata | None = None,
 ) -> None:
     """
     Imports exported data from a stream into a knowledgebox.
@@ -72,8 +73,8 @@ async def import_kb(
             await import_binary(context, kbid, cf, binary_generator)
 
         elif item_type == ExportedItemType.ENTITIES:
-            entities = cast(kb_pb2.EntitiesGroups, data)
-            await set_entities_groups(context, kbid, entities)
+            # This is not supported anymore, we ignore it if we find it in and old backup
+            pass
 
         elif item_type == ExportedItemType.LABELS:
             labels = cast(kb_pb2.Labels, data)
@@ -99,7 +100,13 @@ async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTaskMessage
     """
     kbid, import_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    try:
+        metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Import metadata not found. Skipping import.", extra={"kbid": kbid, "import_id": import_id}
+        )
+        return
 
 
     retry_handler = TaskRetryHandler("import", dm, metadata)
nucliadb/export_import/tasks.py CHANGED
@@ -56,6 +56,7 @@ def get_exports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=export_kb_to_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )
 
 
@@ -77,6 +78,7 @@ def get_imports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=import_kb_from_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )
 
 