nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/brain_v2.py

@@ -18,9 +18,9 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
  import logging
+ from collections.abc import Iterator
  from copy import deepcopy
  from dataclasses import dataclass
- from typing import Optional

  from nidx_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
  from nidx_protos.noderesources_pb2 import (
@@ -81,9 +81,9 @@ class ResourceBrain:
  self,
  basic: Basic,
  user_relations: Relations,
- origin: Optional[Origin],
- previous_processing_status: Optional[Metadata.Status.ValueType],
- security: Optional[utils_pb2.Security],
+ origin: Origin | None,
+ previous_processing_status: Metadata.Status.ValueType | None,
+ security: utils_pb2.Security | None,
  ) -> None:
  self._set_resource_status(basic, previous_processing_status)
  self._set_resource_dates(basic, origin)
@@ -97,9 +97,9 @@ class ResourceBrain:
  self,
  field_key: str,
  extracted_text: ExtractedText,
- field_computed_metadata: Optional[FieldComputedMetadata],
- basic_user_metadata: Optional[UserMetadata],
- field_author: Optional[FieldAuthor],
+ field_computed_metadata: FieldComputedMetadata | None,
+ basic_user_metadata: UserMetadata | None,
+ field_author: FieldAuthor | None,
  replace_field: bool,
  skip_index: bool,
  ) -> None:
@@ -122,13 +122,17 @@ class ResourceBrain:
  field_key: str,
  extracted_text: ExtractedText,
  replace_field: bool,
- skip_texts: Optional[bool],
+ skip_texts: bool | None,
  ):
  if skip_texts is not None:
  self.brain.skip_texts = skip_texts
+
  field_text = extracted_text.text
- for _, split in extracted_text.split_text.items():
- field_text += f" {split} "
+
+ for split_id in self.sorted_splits(extracted_text):
+ split_text = extracted_text.split_text[split_id]
+ field_text += f"{split_text} "
+
  self.brain.texts[field_key].text = field_text

  if replace_field:
@@ -140,18 +144,16 @@ class ResourceBrain:
  def apply_field_labels(
  self,
  field_key: str,
- field_computed_metadata: Optional[FieldComputedMetadata],
- field_author: Optional[FieldAuthor],
- basic_user_metadata: Optional[UserMetadata] = None,
+ field_computed_metadata: FieldComputedMetadata | None,
+ field_author: FieldAuthor | None,
+ basic_user_metadata: UserMetadata | None = None,
  ):
  user_cancelled_labels: set[str] = (
- set(
- [
- f"{classification.labelset}/{classification.label}"
- for classification in basic_user_metadata.classifications
- if classification.cancelled_by_user
- ]
- )
+ {
+ f"{classification.labelset}/{classification.label}"
+ for classification in basic_user_metadata.classifications
+ if classification.cancelled_by_user
+ }
  if basic_user_metadata
  else set()
  )
@@ -212,13 +214,17 @@ class ResourceBrain:
  field_key: str,
  field_computed_metadata: FieldComputedMetadata,
  extracted_text: ExtractedText,
- page_positions: Optional[FilePagePositions],
- user_field_metadata: Optional[UserFieldMetadata],
+ page_positions: FilePagePositions | None,
+ user_field_metadata: UserFieldMetadata | None,
  replace_field: bool,
- skip_paragraphs_index: Optional[bool],
- skip_texts_index: Optional[bool],
- append_splits: Optional[set[str]] = None,
+ skip_paragraphs_index: bool | None,
+ skip_texts_index: bool | None,
+ append_splits: set[str] | None = None,
  ) -> None:
+ """
+ append_splits: when provided, only the splits in this set will be indexed. This is used for conversation appends, to
+ avoid reindexing all previous messages of the conversation.
+ """
  # We need to add the extracted text to the texts section of the Resource so that
  # the paragraphs can be indexed
  self.apply_field_text(
@@ -238,30 +244,41 @@ class ResourceBrain:
  append_splits=append_splits,
  )

+ def sorted_splits(self, extracted_text: ExtractedText) -> Iterator[str]:
+ yield from sorted(extracted_text.split_text.keys())
+
  @observer.wrap({"type": "apply_field_paragraphs"})
  def apply_field_paragraphs(
  self,
  field_key: str,
  field_computed_metadata: FieldComputedMetadata,
  extracted_text: ExtractedText,
- page_positions: Optional[FilePagePositions],
- user_field_metadata: Optional[UserFieldMetadata],
+ page_positions: FilePagePositions | None,
+ user_field_metadata: UserFieldMetadata | None,
  replace_field: bool,
- skip_paragraphs: Optional[bool],
- append_splits: Optional[set[str]] = None,
+ skip_paragraphs: bool | None,
+ append_splits: set[str] | None = None,
  ) -> None:
  if skip_paragraphs is not None:
  self.brain.skip_paragraphs = skip_paragraphs
  unique_paragraphs: set[str] = set()
  user_paragraph_classifications = self._get_paragraph_user_classifications(user_field_metadata)
  paragraph_pages = ParagraphPages(page_positions) if page_positions else None
+
  # Splits of the field
- for subfield, field_metadata in field_computed_metadata.split_metadata.items():
- if should_skip_split_indexing(subfield, replace_field, append_splits):
- continue
- if subfield not in extracted_text.split_text:
- # No extracted text for this split
+
+ # Used to adjust the paragraph start/end when indexing splits, as they are all
+ # concatenated in the main text part of the brain Resource.
+ split_offset = 0
+ for subfield in self.sorted_splits(extracted_text):
+ if subfield not in field_computed_metadata.split_metadata or should_skip_split_indexing(
+ subfield, replace_field, append_splits
+ ):
+ # We're skipping this split but we need to adjust the offset as we have added the text
+ # of this split to the main text
+ split_offset += len(extracted_text.split_text[subfield]) + 1 # +1 for the space
  continue
+ field_metadata = field_computed_metadata.split_metadata[subfield]
  extracted_text_str = extracted_text.split_text[subfield]
  for idx, paragraph in enumerate(field_metadata.paragraphs):
  key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
@@ -288,8 +305,8 @@ class ResourceBrain:
  representation.file = paragraph.representation.reference_file
  representation.is_a_table = paragraph.representation.is_a_table
  p = BrainParagraph(
- start=paragraph.start,
- end=paragraph.end,
+ start=paragraph.start + split_offset,
+ end=paragraph.end + split_offset,
  field=field_key,
  split=subfield,
  index=idx,
@@ -304,6 +321,7 @@ class ResourceBrain:
  representation=representation,
  ),
  )
+ split_offset = p.end + 1 # +1 for the space
  paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
  paragraph_labels = {paragraph_kind_label}
  paragraph_labels.update(
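The hunks above concatenate the split texts in sorted key order (each followed by a space) and keep a running split_offset so that paragraph positions, which are expressed relative to their own split, land at the right place in the concatenated text. A minimal sketch of that bookkeeping, with made-up split texts rather than anything from the package:

split_text = {"b": "second message", "a": "first message"}  # hypothetical conversation splits

split_offset = 0
for subfield in sorted(split_text):  # same order as sorted_splits()
    text = split_text[subfield]
    # a paragraph spanning the whole split, in split-local coordinates
    local_start, local_end = 0, len(text)
    indexed_start = local_start + split_offset  # as in start=paragraph.start + split_offset
    indexed_end = local_end + split_offset
    print(subfield, indexed_start, indexed_end)  # prints "a 0 13", then "b 14 28"
    split_offset = indexed_end + 1  # +1 for the trailing space added by apply_field_text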
@@ -372,7 +390,7 @@ class ResourceBrain:
  self.brain.paragraphs_to_delete.append(full_field_id)

  def _get_paragraph_user_classifications(
- self, basic_user_field_metadata: Optional[UserFieldMetadata]
+ self, basic_user_field_metadata: UserFieldMetadata | None
  ) -> ParagraphClassifications:
  pc = ParagraphClassifications(valid={}, denied={})
  if basic_user_field_metadata is None:
@@ -391,18 +409,16 @@ class ResourceBrain:
  def generate_relations(
  self,
  field_key: str,
- field_computed_metadata: Optional[FieldComputedMetadata],
- basic_user_metadata: Optional[UserMetadata],
+ field_computed_metadata: FieldComputedMetadata | None,
+ basic_user_metadata: UserMetadata | None,
  replace_field: bool,
  ) -> None:
  user_cancelled_labels: set[str] = (
- set(
- [
- f"{classification.labelset}/{classification.label}"
- for classification in basic_user_metadata.classifications
- if classification.cancelled_by_user
- ]
- )
+ {
+ f"{classification.labelset}/{classification.label}"
+ for classification in basic_user_metadata.classifications
+ if classification.cancelled_by_user
+ }
  if basic_user_metadata
  else set()
  )
@@ -491,7 +507,7 @@ class ResourceBrain:
  full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
  self.brain.texts_to_delete.append(full_field_id)
  self.brain.paragraphs_to_delete.append(full_field_id)
- self.brain.sentences_to_delete.append(full_field_id)
+ self.brain.vectors_to_delete_in_all_vectorsets.append(full_field_id)
  self.brain.relation_fields_to_delete.append(field_key)

  @observer.wrap({"type": "generate_vectors"})
@@ -503,8 +519,8 @@
  vectorset: str,
  replace_field: bool = False,
  # cut to specific dimension if specified
- vector_dimension: Optional[int] = None,
- append_splits: Optional[set[str]] = None,
+ vector_dimension: int | None = None,
+ append_splits: set[str] | None = None,
  ):
  fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
  for subfield, vectors in vo.split_vectors.items():
@@ -578,7 +594,7 @@ class ResourceBrain:
  *,
  vectorset: str,
  # cut vectors if a specific dimension is specified
- vector_dimension: Optional[int] = None,
+ vector_dimension: int | None = None,
  ):
  paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
  sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
@@ -603,7 +619,7 @@

  sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index

- def _set_resource_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
+ def _set_resource_status(self, basic: Basic, previous_status: Metadata.Status.ValueType | None):
  """
  We purposefully overwrite what we index as a status and DO NOT reflect
  actual status with what we index.
@@ -633,32 +649,32 @@
  return "EMPTY"
  return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]

- def _set_resource_dates(self, basic: Basic, origin: Optional[Origin]):
+ def _set_resource_dates(self, basic: Basic, origin: Origin | None):
  """
  Adds the user-defined dates to the brain object. This is at resource level and applies to
  all fields of the resource.
  """
- if basic.created.seconds > 0:
+ if basic.created.seconds != 0:
  self.brain.metadata.created.CopyFrom(basic.created)
  else:
  logging.warning(f"Basic metadata has no created field for {self.rid}")
  self.brain.metadata.created.GetCurrentTime()
- if basic.modified.seconds > 0:
+ if basic.modified.seconds != 0:
  self.brain.metadata.modified.CopyFrom(basic.modified)
  else:
- if basic.created.seconds > 0:
+ if basic.created.seconds != 0:
  self.brain.metadata.modified.CopyFrom(basic.created)
  else:
  self.brain.metadata.modified.GetCurrentTime()

  if origin is not None:
  # overwrite created/modified if provided on origin
- if origin.HasField("created") and origin.created.seconds > 0:
+ if origin.HasField("created") and origin.created.seconds != 0:
  self.brain.metadata.created.CopyFrom(origin.created)
- if origin.HasField("modified") and origin.modified.seconds > 0:
+ if origin.HasField("modified") and origin.modified.seconds != 0:
  self.brain.metadata.modified.CopyFrom(origin.modified)

- def _set_resource_relations(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
+ def _set_resource_relations(self, basic: Basic, origin: Origin | None, user_relations: Relations):
  """
  Adds the relations to the brain object corresponding to the user-defined metadata at the resource level:
  - Contributors of the document
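The date hunks above relax the checks from seconds > 0 to seconds != 0, presumably so that user-supplied timestamps before the Unix epoch (which serialize with negative seconds) are no longer discarded in favour of the current time. A quick protobuf illustration of the difference, not code from the package:

from datetime import datetime, timezone

from google.protobuf.timestamp_pb2 import Timestamp

ts = Timestamp()
ts.FromDatetime(datetime(1960, 1, 1, tzinfo=timezone.utc))  # pre-epoch date
print(ts.seconds)       # negative seconds since the epoch
print(ts.seconds > 0)   # False: the old check would fall back to GetCurrentTime()
print(ts.seconds != 0)  # True: the new check keeps the provided date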
@@ -702,7 +718,7 @@

  self.brain.relation_fields_to_delete.append("a/metadata")

- def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
+ def _set_resource_labels(self, basic: Basic, origin: Origin | None):
  """
  Adds the resource-level labels to the brain object.
  These levels are user-defined in basic or origin metadata.
@@ -759,7 +775,7 @@

  def is_paragraph_repeated_in_field(
  paragraph: Paragraph,
- extracted_text: Optional[str],
+ extracted_text: str | None,
  unique_paragraphs: set[str],
  ) -> bool:
  if extracted_text is None:
@@ -798,15 +814,13 @@ class ParagraphPages:
  return self._materialized[paragraph_start_index]
  except IndexError:
  logger.error(
- f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}" # noqa
+ f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"
  )
  if len(self._materialized) > 0:
  return self._materialized[-1]
  return 0


- def should_skip_split_indexing(
- split: str, replace_field: bool, append_splits: Optional[set[str]]
- ) -> bool:
+ def should_skip_split_indexing(split: str, replace_field: bool, append_splits: set[str] | None) -> bool:
  # When replacing the whole field, reindex all splits. Otherwise, we're only indexing the splits that are appended
  return not replace_field and append_splits is not None and split not in append_splits
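Spelled out, the predicate above yields three behaviours: a full field replace reindexes every split, so does the absence of an append set, and an append only indexes the splits listed in it. A small check of those cases, copying the function verbatim and using made-up split ids:

def should_skip_split_indexing(split: str, replace_field: bool, append_splits: set[str] | None) -> bool:
    # When replacing the whole field, reindex all splits. Otherwise, only index the appended splits.
    return not replace_field and append_splits is not None and split not in append_splits

assert should_skip_split_indexing("m1", replace_field=True, append_splits={"m2"}) is False   # full replace: index everything
assert should_skip_split_indexing("m1", replace_field=False, append_splits=None) is False    # no append set: index everything
assert should_skip_split_indexing("m1", replace_field=False, append_splits={"m2"}) is True   # append: old split "m1" is skipped
assert should_skip_split_indexing("m2", replace_field=False, append_splits={"m2"}) is False  # append: new split "m2" is indexed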
nucliadb/ingest/orm/broker_message.py

@@ -56,7 +56,7 @@ class _BrokerMessageBuilder:
  # clear the state and generate a new broker message
  self.bm.Clear()

- self.bm.kbid = resource.kb.kbid
+ self.bm.kbid = resource.kbid
  self.bm.uuid = resource.uuid
  basic = await resource.get_basic()
  if basic is not None:
@@ -93,9 +93,7 @@ class _BrokerMessageBuilder:
  self.bm.link_extracted_data.append(link_extracted_data)

  # Field vectors
- async for vectorset_id, vs in datamanagers.vectorsets.iter(
- resource.txn, kbid=resource.kb.kbid
- ):
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(resource.txn, kbid=resource.kbid):
  await self.generate_field_vectors(
  type_id, field_id, field, vectorset_id, vs.storage_key_kind
  )
nucliadb/ingest/orm/entities.py

@@ -18,8 +18,7 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #

- import asyncio
- from typing import AsyncGenerator, Optional
+ from collections.abc import AsyncGenerator

  from nidx_protos.nodereader_pb2 import (
  Faceted,
@@ -29,23 +28,12 @@ from nidx_protos.nodereader_pb2 import (
  SearchResponse,
  )

- from nucliadb.common import datamanagers
- from nucliadb.common.cluster.exceptions import (
- AlreadyExists,
- EntitiesGroupNotFound,
- )
  from nucliadb.common.cluster.utils import get_shard_manager
- from nucliadb.common.datamanagers.entities import (
- KB_DELETED_ENTITIES_GROUPS,
- KB_ENTITIES,
- KB_ENTITIES_GROUP,
- )
  from nucliadb.common.maindb.driver import Transaction
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
  from nucliadb.ingest.settings import settings
  from nucliadb.search.search.shards import graph_search_shard, query_shard
  from nucliadb_protos.knowledgebox_pb2 import (
- DeletedEntitiesGroups,
  EntitiesGroup,
  EntitiesGroupSummary,
  Entity,
@@ -53,8 +41,6 @@ from nucliadb_protos.knowledgebox_pb2 import (
  from nucliadb_protos.utils_pb2 import RelationNode
  from nucliadb_protos.writer_pb2 import GetEntitiesResponse

- from .exceptions import EntityManagementException
-
  MAX_DUPLICATES = 300
  MAX_DELETED = 300

@@ -69,20 +55,11 @@ class EntitiesManager:
  self.txn = txn
  self.kbid = self.kb.kbid

- async def create_entities_group(self, group: str, entities: EntitiesGroup):
- if await self.entities_group_exists(group):
- raise AlreadyExists(f"Entities group {group} already exists")
-
- await self.store_entities_group(group, entities)
-
  async def get_entities(self, entities: GetEntitiesResponse):
  async for group, eg in self.iterate_entities_groups(exclude_deleted=True):
  entities.groups[group].CopyFrom(eg)

- async def get_entities_group(self, group: str) -> Optional[EntitiesGroup]:
- deleted = await self.is_entities_group_deleted(group)
- if deleted:
- return None
+ async def get_entities_group(self, group: str) -> EntitiesGroup | None:
  return await self.get_entities_group_inner(group)

  async def get_entities_groups(self) -> dict[str, EntitiesGroup]:
@@ -93,113 +70,18 @@ class EntitiesManager:

  async def list_entities_groups(self) -> dict[str, EntitiesGroupSummary]:
  groups = {}
- max_simultaneous = asyncio.Semaphore(10)

- async def _composition(group: str):
- async with max_simultaneous:
- stored = await self.get_stored_entities_group(group)
- if stored is not None:
- groups[group] = EntitiesGroupSummary(
- title=stored.title, color=stored.color, custom=stored.custom
- )
- else:
- # We don't want to search for each indexed group, as we are
- # providing a quick summary
- groups[group] = EntitiesGroupSummary()
+ async for group in self.iterate_entities_groups_names(exclude_deleted=True):
+ groups[group] = EntitiesGroupSummary()

- tasks = [
- asyncio.create_task(_composition(group))
- async for group in self.iterate_entities_groups_names(exclude_deleted=True)
- ]
- if tasks:
- await asyncio.wait(tasks)
  return groups

- async def update_entities(self, group: str, entities: dict[str, Entity]):
- """Update entities on an entity group. New entities are appended and existing
- are overwriten. Existing entities not appearing in `entities` are left
- intact. Use `delete_entities` to delete them instead.
-
- """
- if not await self.entities_group_exists(group):
- raise EntitiesGroupNotFound(f"Entities group '{group}' doesn't exist")
-
- entities_group = await self.get_stored_entities_group(group)
- if entities_group is None:
- entities_group = EntitiesGroup()
-
- for name, entity in entities.items():
- entities_group.entities[name].CopyFrom(entity)
-
- await self.store_entities_group(group, entities_group)
-
- async def set_entities_group(self, group: str, entities: EntitiesGroup):
- indexed = await self.get_indexed_entities_group(group)
- if indexed is None:
- updated = entities
- else:
- updated = EntitiesGroup()
- updated.CopyFrom(entities)
-
- for name, entity in indexed.entities.items():
- if name not in updated.entities:
- updated.entities[name].CopyFrom(entity)
- updated.entities[name].deleted = True
-
- await self.store_entities_group(group, updated)
-
- async def set_entities_group_force(self, group: str, entitiesgroup: EntitiesGroup):
- await self.store_entities_group(group, entitiesgroup)
-
- async def set_entities_group_metadata(
- self, group: str, *, title: Optional[str] = None, color: Optional[str] = None
- ):
- entities_group = await self.get_stored_entities_group(group)
- if entities_group is None:
- entities_group = EntitiesGroup()
-
- if title:
- entities_group.title = title
- if color:
- entities_group.color = color
-
- await self.store_entities_group(group, entities_group)
-
- async def delete_entities(self, group: str, delete: list[str]):
- stored = await self.get_stored_entities_group(group)
-
- stored = stored or EntitiesGroup()
- for name in delete:
- if name not in stored.entities:
- entity = stored.entities[name]
- entity.value = name
- else:
- entity = stored.entities[name]
- entity.deleted = True
- await self.store_entities_group(group, stored)
-
- async def delete_entities_group(self, group: str):
- await self.delete_stored_entities_group(group)
- await self.mark_entities_group_as_deleted(group)
-
  # Private API

- async def get_entities_group_inner(self, group: str) -> Optional[EntitiesGroup]:
- stored = await self.get_stored_entities_group(group)
- indexed = await self.get_indexed_entities_group(group)
- if stored is None and indexed is None:
- # Entity group does not exist
- return None
- elif stored is not None and indexed is not None:
- entities_group = self.merge_entities_groups(indexed, stored)
- else:
- entities_group = stored or indexed
- return entities_group
-
- async def get_stored_entities_group(self, group: str) -> Optional[EntitiesGroup]:
- return await datamanagers.entities.get_entities_group(self.txn, kbid=self.kbid, group=group)
+ async def get_entities_group_inner(self, group: str) -> EntitiesGroup | None:
+ return await self.get_indexed_entities_group(group)

- async def get_indexed_entities_group(self, group: str) -> Optional[EntitiesGroup]:
+ async def get_indexed_entities_group(self, group: str) -> EntitiesGroup | None:
  shard_manager = get_shard_manager()

  async def do_entities_search(shard_id: str) -> GraphSearchResponse:
@@ -228,26 +110,9 @@ class EntitiesManager:
  eg = EntitiesGroup(entities=entities)
  return eg

- async def get_deleted_entities_groups(self) -> set[str]:
- deleted: set[str] = set()
- key = KB_DELETED_ENTITIES_GROUPS.format(kbid=self.kbid)
- payload = await self.txn.get(key)
- if payload:
- deg = DeletedEntitiesGroups()
- deg.ParseFromString(payload)
- deleted.update(deg.entities_groups)
- return deleted
-
  async def entities_group_exists(self, group: str) -> bool:
- stored = await self.get_stored_entities_group(group)
- if stored is not None:
- return True
-
  indexed = await self.get_indexed_entities_group(group)
- if indexed is not None:
- return True
-
- return False
+ return indexed is not None

  async def iterate_entities_groups(
  self, exclude_deleted: bool
@@ -262,27 +127,10 @@ class EntitiesManager:
  self,
  exclude_deleted: bool,
  ) -> AsyncGenerator[str, None]:
- # Start the task to get indexed groups
- indexed_task = asyncio.create_task(self.get_indexed_entities_groups_names())
-
- if exclude_deleted:
- deleted_groups = await self.get_deleted_entities_groups()
-
  visited_groups = set()
-
- # stored groups
- entities_key = KB_ENTITIES.format(kbid=self.kbid)
- async for key in self.txn.keys(entities_key):
- group = key.split("/")[-1]
- if exclude_deleted and group in deleted_groups:
- continue
- yield group
- visited_groups.add(group)
-
- # indexed groups
- indexed_groups = await indexed_task
+ indexed_groups = await self.get_indexed_entities_groups_names()
  for group in indexed_groups:
- if (exclude_deleted and group in deleted_groups) or group in visited_groups:
+ if group in visited_groups:
  continue
  yield group
  visited_groups.add(group)
@@ -319,53 +167,6 @@ class EntitiesManager:
  return set()
  return set.union(*results)

- async def store_entities_group(self, group: str, eg: EntitiesGroup):
- meta_cache = await datamanagers.entities.get_entities_meta_cache(self.txn, kbid=self.kbid)
- duplicates = {}
- deleted = []
- duplicate_count = 0
- for entity in eg.entities.values():
- if entity.deleted:
- deleted.append(entity.value)
- continue
- if len(entity.represents) == 0:
- continue
- duplicates[entity.value] = list(entity.represents)
- duplicate_count += len(duplicates[entity.value])
-
- if duplicate_count > MAX_DUPLICATES:
- raise EntityManagementException(
- f"Too many duplicates: {duplicate_count}. Max of {MAX_DUPLICATES} currently allowed"
- )
- if len(deleted) > MAX_DELETED:
- raise EntityManagementException(
- f"Too many deleted entities: {len(deleted)}. Max of {MAX_DELETED} currently allowed"
- )
-
- meta_cache.set_duplicates(group, duplicates)
- meta_cache.set_deleted(group, deleted)
- await datamanagers.entities.set_entities_meta_cache(self.txn, kbid=self.kbid, cache=meta_cache)
-
- await datamanagers.entities.set_entities_group(
- self.txn, kbid=self.kbid, group_id=group, entities=eg
- )
- # if it was preivously deleted, we must unmark it
- await self.unmark_entities_group_as_deleted(group)
-
- async def is_entities_group_deleted(self, group: str):
- deleted_groups = await self.get_deleted_entities_groups()
- return group in deleted_groups
-
- async def delete_stored_entities_group(self, group: str):
- entities_key = KB_ENTITIES_GROUP.format(kbid=self.kbid, id=group)
- await self.txn.delete(entities_key)
-
- async def mark_entities_group_as_deleted(self, group: str):
- await datamanagers.entities.mark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
- async def unmark_entities_group_as_deleted(self, group: str):
- await datamanagers.entities.unmark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
  @staticmethod
  def merge_entities_groups(indexed: EntitiesGroup, stored: EntitiesGroup):
  """Create a new EntitiesGroup with the merged entities from `stored` and
nucliadb/ingest/orm/index_message.py

@@ -20,7 +20,7 @@


  import asyncio
- from typing import Optional, Sequence
+ from collections.abc import Sequence

  from nidx_protos.noderesources_pb2 import Resource as IndexMessage

@@ -70,8 +70,8 @@ class IndexMessageBuilder:
  vectors: bool = True,
  relations: bool = True,
  replace: bool = True,
- vectorset_configs: Optional[list[VectorSetConfig]] = None,
- append_splits: Optional[set[str]] = None,
+ vectorset_configs: list[VectorSetConfig] | None = None,
+ append_splits: set[str] | None = None,
  ):
  field = await self.resource.get_field(fieldid.field, fieldid.field_type)
  extracted_text = await field.get_extracted_text()
@@ -281,7 +281,7 @@ class IndexMessageBuilder:
  vectorset_configs = [
  vectorset_config
  async for _, vectorset_config in datamanagers.vectorsets.iter(
- self.resource.txn, kbid=self.resource.kb.kbid
+ self.resource.txn, kbid=self.resource.kbid
  )
  ]
  return vectorset_configs