nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
--- a/nucliadb/ingest/fields/conversation.py
+++ b/nucliadb/ingest/fields/conversation.py
@@ -18,16 +18,19 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import uuid
-from typing import Any, Optional
+from typing import Any
 
 from nucliadb.ingest.fields.base import Field
-from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
+from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitsMetadata
 from nucliadb_protos.resources_pb2 import Conversation as PBConversation
 from nucliadb_utils.storages.storage import StorageField
 
+MAX_CONVERSATION_MESSAGES = None  # No limit
+
 PAGE_SIZE = 200
 
 CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
+CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"
 CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
 
 
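The three key templates above define where a conversation field's pages, split bookkeeping, and field metadata live in maindb. A minimal sketch of how they expand; the kbid, resource id, and field name are made-up placeholders:

    # Hypothetical identifiers, for illustration only.
    kbid, rid, field = "kb-123", "res-456", "chat"

    CONVERSATION_PAGE_VALUE.format(kbid=kbid, uuid=rid, type="c", field=field, page=1)
    # -> "/kbs/kb-123/r/res-456/f/c/chat/1"

    CONVERSATION_SPLITS_METADATA.format(kbid=kbid, uuid=rid, type="c", field=field)
    # -> "/kbs/kb-123/r/res-456/f/c/chat/splits_metadata"
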
@@ -39,7 +42,7 @@ class Conversation(Field[PBConversation]):
     pbklass = PBConversation
     type: str = "c"
     value: dict[int, PBConversation]
-    metadata: Optional[FieldConversation]
+    metadata: FieldConversation | None
 
     _created: bool = False
 
@@ -47,20 +50,33 @@ class Conversation(Field[PBConversation]):
         self,
         id: str,
         resource: Any,
-        pb: Optional[Any] = None,
-        value: Optional[dict[int, PBConversation]] = None,
+        pb: Any | None = None,
+        value: dict[int, PBConversation] | None = None,
     ):
-        super(Conversation, self).__init__(id, resource, pb, value)
+        super().__init__(id, resource, pb, value)
         self.value = {}
+        self._splits_metadata: SplitsMetadata | None = None
+        self.metadata = None
+
+    async def delete_value(self):
+        await self.resource.txn.delete_by_prefix(
+            CONVERSATION_METADATA.format(kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id)
+        )
+        self._splits_metadata = None
         self.metadata = None
+        self.value.clear()
 
     async def set_value(self, payload: PBConversation):
+        if payload.replace_field:
+            # As we need to overwrite the value of the conversation, first delete any previous data.
+            await self.delete_value()
+
         metadata = await self.get_metadata()
         metadata.extract_strategy = payload.extract_strategy
         metadata.split_strategy = payload.split_strategy
 
         # Get the last page if it exists
-        last_page: Optional[PBConversation] = None
+        last_page: PBConversation | None = None
         if self._created is False and metadata.pages > 0:
             try:
                 last_page = await self.db_get_value(page=metadata.pages)
@@ -70,10 +86,13 @@ class Conversation(Field[PBConversation]):
                 last_page = PBConversation()
                 metadata.pages += 1
 
+        self._splits_metadata = await self.get_splits_metadata()
+
         # Make sure message attachment files are on our region. This is needed
         # to support the hybrid-onprem deployment as the attachments must be stored
         # at the storage services of the client's premises.
         for message in payload.messages:
+            self._splits_metadata.metadata.get_or_create(message.ident)
             new_message_files = []
             for idx, file in enumerate(message.content.attachments):
                 if self.storage.needs_move(file, self.kbid):
@@ -117,8 +136,9 @@ class Conversation(Field[PBConversation]):
 
         # Finally, set the metadata
         await self.db_set_metadata(metadata)
+        await self.set_splits_metadata(self._splits_metadata)
 
-    async def get_value(self, page: Optional[int] = None) -> Optional[PBConversation]:
+    async def get_value(self, page: int | None = None) -> PBConversation | None:
         # If no page was requested, force fetch of metadata
         # and set the page to the last page
         if page is None and self.metadata is None:
@@ -133,7 +153,7 @@ class Conversation(Field[PBConversation]):
             except PageNotFound:
                 return None
 
-    async def get_full_conversation(self) -> Optional[PBConversation]:
+    async def get_full_conversation(self) -> PBConversation | None:
         """
         Messages of a conversation may be stored across several pages.
         This method fetches them all and returns a single complete conversation.
@@ -203,3 +223,29 @@ class Conversation(Field[PBConversation]):
         self.metadata = payload
         self.resource.modified = True
         self._created = False
+
+    async def get_splits_metadata(self) -> SplitsMetadata:
+        if self._splits_metadata is None:
+            field_key = CONVERSATION_SPLITS_METADATA.format(
+                kbid=self.kbid,
+                uuid=self.uuid,
+                type=self.type,
+                field=self.id,
+            )
+            payload = await self.resource.txn.get(field_key)
+            if payload is None:
+                return SplitsMetadata()
+            self._splits_metadata = SplitsMetadata()
+            self._splits_metadata.ParseFromString(payload)
+        return self._splits_metadata
+
+    async def set_splits_metadata(self, payload: SplitsMetadata) -> None:
+        key = CONVERSATION_SPLITS_METADATA.format(
+            kbid=self.kbid,
+            uuid=self.uuid,
+            type=self.type,
+            field=self.id,
+        )
+        await self.resource.txn.set(key, payload.SerializeToString())
+        self._splits_metadata = payload
+        self.resource.modified = True
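Together with the replace_field handling in set_value, these helpers keep a per-conversation SplitsMetadata record in maindb: each appended message ident is registered in the metadata map (a protobuf map, judging by the get_or_create call) and persisted under its own key. A rough usage sketch, assuming conv is an already-loaded Conversation field:

    # Sketch only: append messages, then inspect which splits are tracked.
    async def append_and_list_splits(conv, payload):
        # set_value registers payload.messages[*].ident in the splits
        # metadata and persists it next to the page data.
        await conv.set_value(payload)
        splits = await conv.get_splits_metadata()
        return sorted(splits.metadata.keys())
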
--- a/nucliadb/ingest/fields/exceptions.py
+++ b/nucliadb/ingest/fields/exceptions.py
@@ -17,7 +17,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Type
 
 
 class InvalidFieldClass(Exception):
@@ -25,7 +24,7 @@ class InvalidFieldClass(Exception):
 
 
 class InvalidPBClass(Exception):
-    def __init__(self, source: Type, destination: Type):
+    def __init__(self, source: type, destination: type):
         self.source = source
         self.destination = destination
         super().__init__(f"Source and destination do not match {self.source} - {self.destination}")
--- a/nucliadb/ingest/fields/file.py
+++ b/nucliadb/ingest/fields/file.py
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Any, Optional
+from typing import Any
 
 from nucliadb.ingest.fields.base import Field
 from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
@@ -30,22 +30,22 @@ class File(Field[FieldFile]):
     pbklass = FieldFile
     value: FieldFile
     type: str = "f"
-    file_extracted_data: Optional[FileExtractedData]
+    file_extracted_data: FileExtractedData | None
 
     def __init__(
         self,
         id: str,
         resource: Any,
-        pb: Optional[Any] = None,
-        value: Optional[str] = None,
+        pb: Any | None = None,
+        value: str | None = None,
     ):
-        super(File, self).__init__(id, resource, pb, value)
+        super().__init__(id, resource, pb, value)
         self.file_extracted_data = None
 
     async def set_value(self, payload: FieldFile):
         old_file = await self.get_value()
         if old_file is None:
-            old_cf: Optional[CloudFile] = None
+            old_cf: CloudFile | None = None
         else:
             old_cf = old_file.file
 
@@ -57,7 +57,7 @@ class File(Field[FieldFile]):
 
         await self.db_set_value(payload)
 
-    async def get_value(self) -> Optional[FieldFile]:
+    async def get_value(self) -> FieldFile | None:
         return await self.db_get_value()
 
     async def set_file_extracted_data(self, file_extracted_data: FileExtractedData):
@@ -101,10 +101,24 @@ class File(Field[FieldFile]):
         await self.storage.upload_pb(sf, file_extracted_data)
         self.file_extracted_data = file_extracted_data
 
-    async def get_file_extracted_data(self) -> Optional[FileExtractedData]:
+    async def get_file_extracted_data(self) -> FileExtractedData | None:
         if self.file_extracted_data is None:
             sf: StorageField = self.storage.file_extracted(
                 self.kbid, self.uuid, self.type, self.id, FILE_METADATA
             )
             self.file_extracted_data = await self.storage.download_pb(sf, FileExtractedData)
         return self.file_extracted_data
+
+    async def thumbnail(self) -> StorageField | None:
+        """Access the file field thumbnail."""
+        fed = await self.get_file_extracted_data()
+        if fed is None:
+            return None
+        if not fed.HasField("file_thumbnail"):
+            return None
+
+        sf: StorageField = self.storage.file_extracted(
+            self.kbid, self.uuid, self.type, self.id, "file_thumbnail"
+        )
+        sf.field = fed.file_thumbnail
+        return sf
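The new thumbnail helper returns a StorageField handle pointing at the extracted thumbnail rather than the thumbnail bytes themselves. A hedged usage sketch; download_storage_field is a hypothetical helper standing in for whatever download primitive the configured storage driver exposes:

    # Sketch: fetch a file field's thumbnail, if processing produced one.
    async def get_thumbnail_bytes(file_field) -> bytes | None:
        sf = await file_field.thumbnail()
        if sf is None:
            # No extracted data yet, or no thumbnail in it.
            return None
        return await download_storage_field(sf)  # hypothetical helper
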
--- a/nucliadb/ingest/fields/link.py
+++ b/nucliadb/ingest/fields/link.py
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Any, Optional
+from typing import Any
 
 from nucliadb.ingest.fields.base import Field
 from nucliadb_protos.resources_pb2 import CloudFile, FieldLink, LinkExtractedData
@@ -30,22 +30,22 @@ class Link(Field[FieldLink]):
     pbklass = FieldLink
     value: FieldLink
     type: str = "u"
-    link_extracted_data: Optional[LinkExtractedData]
+    link_extracted_data: LinkExtractedData | None
 
     def __init__(
         self,
         id: str,
         resource: Any,
-        pb: Optional[Any] = None,
-        value: Optional[str] = None,
+        pb: Any | None = None,
+        value: str | None = None,
     ):
-        super(Link, self).__init__(id, resource, pb, value)
+        super().__init__(id, resource, pb, value)
         self.link_extracted_data = None
 
     async def set_value(self, payload: FieldLink):
         await self.db_set_value(payload)
 
-    async def get_value(self) -> Optional[FieldLink]:
+    async def get_value(self) -> FieldLink | None:
         return await self.db_get_value()
 
     async def set_link_extracted_data(self, link_extracted_data: LinkExtractedData):
@@ -88,7 +88,7 @@ class Link(Field[FieldLink]):
         await self.storage.upload_pb(sf, link_extracted_data)
         self.link_extracted_data = link_extracted_data
 
-    async def get_link_extracted_data(self) -> Optional[LinkExtractedData]:
+    async def get_link_extracted_data(self) -> LinkExtractedData | None:
         if self.link_extracted_data is None:
             sf: StorageField = self.storage.file_extracted(
                 self.kbid, self.uuid, self.type, self.id, LINK_METADATA
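File.get_file_extracted_data and Link.get_link_extracted_data share the same memoized-getter shape: download the protobuf once with download_pb, cache it on the instance, and serve the cached copy afterwards. A generic sketch of that pattern; the class and names are illustrative, not part of the nucliadb API:

    # Generic shape of the cached extracted-data getters above.
    class CachedExtractedData:
        def __init__(self, storage, pb_klass):
            self._storage = storage
            self._pb_klass = pb_klass
            self._cached = None  # downloaded at most once per instance

        async def get(self, storage_field):
            if self._cached is None:
                self._cached = await self._storage.download_pb(storage_field, self._pb_klass)
            return self._cached
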
--- a/nucliadb/ingest/fields/text.py
+++ b/nucliadb/ingest/fields/text.py
@@ -19,7 +19,6 @@
 #
 
 import hashlib
-from typing import Optional
 
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
@@ -39,8 +38,8 @@ class Text(Field[FieldText]):
 
     async def set_value(self, payload: FieldText):
         if payload.md5 == "":
-            payload.md5 = hashlib.md5(payload.body.encode()).hexdigest()
+            payload.md5 = hashlib.md5(payload.body.encode(), usedforsecurity=False).hexdigest()
         await self.db_set_value(payload)
 
-    async def get_value(self) -> Optional[FieldText]:
+    async def get_value(self) -> FieldText | None:
         return await self.db_get_value()
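usedforsecurity=False (accepted by hashlib constructors since Python 3.9) declares this MD5 call a plain content checksum, which keeps it working on FIPS-restricted interpreters that reject MD5 for security purposes. For example:

    import hashlib

    body = "some extracted text"
    # Content fingerprint only; not used for security purposes.
    digest = hashlib.md5(body.encode(), usedforsecurity=False).hexdigest()
    print(digest)  # 32-character hex string
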
--- a/nucliadb/ingest/orm/brain_v2.py
+++ b/nucliadb/ingest/orm/brain_v2.py
@@ -18,9 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import logging
+from collections.abc import Iterator
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Optional
 
 from nidx_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
 from nidx_protos.noderesources_pb2 import (
@@ -81,9 +81,9 @@ class ResourceBrain:
         self,
         basic: Basic,
         user_relations: Relations,
-        origin: Optional[Origin],
-        previous_processing_status: Optional[Metadata.Status.ValueType],
-        security: Optional[utils_pb2.Security],
+        origin: Origin | None,
+        previous_processing_status: Metadata.Status.ValueType | None,
+        security: utils_pb2.Security | None,
     ) -> None:
         self._set_resource_status(basic, previous_processing_status)
         self._set_resource_dates(basic, origin)
@@ -97,9 +97,9 @@ class ResourceBrain:
         self,
         field_key: str,
         extracted_text: ExtractedText,
-        field_computed_metadata: Optional[FieldComputedMetadata],
-        basic_user_metadata: Optional[UserMetadata],
-        field_author: Optional[FieldAuthor],
+        field_computed_metadata: FieldComputedMetadata | None,
+        basic_user_metadata: UserMetadata | None,
+        field_author: FieldAuthor | None,
         replace_field: bool,
         skip_index: bool,
     ) -> None:
@@ -122,13 +122,17 @@ class ResourceBrain:
         field_key: str,
         extracted_text: ExtractedText,
         replace_field: bool,
-        skip_texts: Optional[bool],
+        skip_texts: bool | None,
     ):
         if skip_texts is not None:
             self.brain.skip_texts = skip_texts
+
         field_text = extracted_text.text
-        for _, split in extracted_text.split_text.items():
-            field_text += f" {split} "
+
+        for split_id in self.sorted_splits(extracted_text):
+            split_text = extracted_text.split_text[split_id]
+            field_text += f"{split_text} "
+
         self.brain.texts[field_key].text = field_text
 
         if replace_field:
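Iterating split ids in sorted order makes the concatenated field text deterministic, which matters because paragraph offsets are computed against that concatenation later on. A small worked example with plain dicts standing in for the ExtractedText protobuf:

    # Splits concatenated in sorted id order, each followed by a space.
    split_text = {"msg-2": "world", "msg-1": "hello"}

    field_text = ""  # extracted_text.text; empty for conversation fields
    for split_id in sorted(split_text.keys()):  # "msg-1", then "msg-2"
        field_text += f"{split_text[split_id]} "

    assert field_text == "hello world "
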
@@ -140,18 +144,16 @@ class ResourceBrain:
     def apply_field_labels(
         self,
         field_key: str,
-        field_computed_metadata: Optional[FieldComputedMetadata],
-        field_author: Optional[FieldAuthor],
-        basic_user_metadata: Optional[UserMetadata] = None,
+        field_computed_metadata: FieldComputedMetadata | None,
+        field_author: FieldAuthor | None,
+        basic_user_metadata: UserMetadata | None = None,
     ):
         user_cancelled_labels: set[str] = (
-            set(
-                [
-                    f"{classification.labelset}/{classification.label}"
-                    for classification in basic_user_metadata.classifications
-                    if classification.cancelled_by_user
-                ]
-            )
+            {
+                f"{classification.labelset}/{classification.label}"
+                for classification in basic_user_metadata.classifications
+                if classification.cancelled_by_user
+            }
             if basic_user_metadata
             else set()
         )
@@ -193,7 +195,7 @@ class ResourceBrain:
         if field_author is not None and field_author.WhichOneof("author") == "data_augmentation":
             field_type, field_id = field_key.split("/")
             da_task_id = ids.extract_data_augmentation_id(field_id)
-            if da_task_id is None:  # pragma: nocover
+            if da_task_id is None:  # pragma: no cover
                 logger.warning(
                     "Data augmentation field id has an unexpected format! Skipping label",
                     extra={
@@ -212,12 +214,17 @@ class ResourceBrain:
         field_key: str,
         field_computed_metadata: FieldComputedMetadata,
         extracted_text: ExtractedText,
-        page_positions: Optional[FilePagePositions],
-        user_field_metadata: Optional[UserFieldMetadata],
+        page_positions: FilePagePositions | None,
+        user_field_metadata: UserFieldMetadata | None,
         replace_field: bool,
-        skip_paragraphs_index: Optional[bool],
-        skip_texts_index: Optional[bool],
+        skip_paragraphs_index: bool | None,
+        skip_texts_index: bool | None,
+        append_splits: set[str] | None = None,
     ) -> None:
+        """
+        append_splits: when provided, only the splits in this set will be indexed. This is used for
+        conversation appends, to avoid reindexing all previous messages of the conversation.
+        """
         # We need to add the extracted text to the texts section of the Resource so that
         # the paragraphs can be indexed
         self.apply_field_text(
@@ -234,27 +241,45 @@ class ResourceBrain:
             user_field_metadata,
             replace_field=replace_field,
             skip_paragraphs=skip_paragraphs_index,
+            append_splits=append_splits,
         )
 
+    def sorted_splits(self, extracted_text: ExtractedText) -> Iterator[str]:
+        yield from sorted(extracted_text.split_text.keys())
+
     @observer.wrap({"type": "apply_field_paragraphs"})
     def apply_field_paragraphs(
         self,
         field_key: str,
         field_computed_metadata: FieldComputedMetadata,
         extracted_text: ExtractedText,
-        page_positions: Optional[FilePagePositions],
-        user_field_metadata: Optional[UserFieldMetadata],
+        page_positions: FilePagePositions | None,
+        user_field_metadata: UserFieldMetadata | None,
         replace_field: bool,
-        skip_paragraphs: Optional[bool],
+        skip_paragraphs: bool | None,
+        append_splits: set[str] | None = None,
     ) -> None:
         if skip_paragraphs is not None:
             self.brain.skip_paragraphs = skip_paragraphs
         unique_paragraphs: set[str] = set()
         user_paragraph_classifications = self._get_paragraph_user_classifications(user_field_metadata)
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
+
         # Splits of the field
-        for subfield, field_metadata in field_computed_metadata.split_metadata.items():
-            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
+
+        # Used to adjust the paragraph start/end when indexing splits, as they are all
+        # concatenated in the main text part of the brain Resource.
+        split_offset = 0
+        for subfield in self.sorted_splits(extracted_text):
+            if subfield not in field_computed_metadata.split_metadata or should_skip_split_indexing(
+                subfield, replace_field, append_splits
+            ):
+                # We're skipping this split but we need to adjust the offset as we have added the text
+                # of this split to the main text
+                split_offset += len(extracted_text.split_text[subfield]) + 1  # +1 for the space
+                continue
+            field_metadata = field_computed_metadata.split_metadata[subfield]
+            extracted_text_str = extracted_text.split_text[subfield]
             for idx, paragraph in enumerate(field_metadata.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                 denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -280,8 +305,8 @@ class ResourceBrain:
                 representation.file = paragraph.representation.reference_file
                 representation.is_a_table = paragraph.representation.is_a_table
                 p = BrainParagraph(
-                    start=paragraph.start,
-                    end=paragraph.end,
+                    start=paragraph.start + split_offset,
+                    end=paragraph.end + split_offset,
                     field=field_key,
                     split=subfield,
                     index=idx,
@@ -296,6 +321,7 @@ class ResourceBrain:
                         representation=representation,
                     ),
                 )
+                split_offset = p.end + 1  # +1 for the space
                 paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
                 paragraph_labels = {paragraph_kind_label}
                 paragraph_labels.update(
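Since every split's text is appended with a trailing space, a paragraph's local start/end must be shifted by the length of everything concatenated before its split; split_offset accumulates exactly that, including across skipped splits. A worked example of the arithmetic, continuing the two-split case above:

    # "hello world " -> "hello" at [0, 5), "world" at [6, 11).
    splits = {"msg-1": "hello", "msg-2": "world"}

    split_offset = 0
    positions = {}
    for split_id in sorted(splits.keys()):
        text = splits[split_id]
        # One paragraph covering the whole split: local range (0, len(text)).
        start, end = 0 + split_offset, len(text) + split_offset
        positions[split_id] = (start, end)
        split_offset = end + 1  # +1 for the separating space

    assert positions == {"msg-1": (0, 5), "msg-2": (6, 11)}
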
@@ -308,7 +334,7 @@ class ResourceBrain:
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
         # Main field
-        extracted_text_str = extracted_text.text if extracted_text else None
+        extracted_text_str = extracted_text.text
         for idx, paragraph in enumerate(field_computed_metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
             denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -364,7 +390,7 @@ class ResourceBrain:
             self.brain.paragraphs_to_delete.append(full_field_id)
 
     def _get_paragraph_user_classifications(
-        self, basic_user_field_metadata: Optional[UserFieldMetadata]
+        self, basic_user_field_metadata: UserFieldMetadata | None
     ) -> ParagraphClassifications:
         pc = ParagraphClassifications(valid={}, denied={})
         if basic_user_field_metadata is None:
@@ -383,18 +409,16 @@ class ResourceBrain:
     def generate_relations(
         self,
         field_key: str,
-        field_computed_metadata: Optional[FieldComputedMetadata],
-        basic_user_metadata: Optional[UserMetadata],
+        field_computed_metadata: FieldComputedMetadata | None,
+        basic_user_metadata: UserMetadata | None,
         replace_field: bool,
     ) -> None:
         user_cancelled_labels: set[str] = (
-            set(
-                [
-                    f"{classification.labelset}/{classification.label}"
-                    for classification in basic_user_metadata.classifications
-                    if classification.cancelled_by_user
-                ]
-            )
+            {
+                f"{classification.labelset}/{classification.label}"
+                for classification in basic_user_metadata.classifications
+                if classification.cancelled_by_user
+            }
             if basic_user_metadata
             else set()
         )
@@ -483,7 +507,7 @@ class ResourceBrain:
         full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
         self.brain.texts_to_delete.append(full_field_id)
         self.brain.paragraphs_to_delete.append(full_field_id)
-        self.brain.sentences_to_delete.append(full_field_id)
+        self.brain.vectors_to_delete_in_all_vectorsets.append(full_field_id)
         self.brain.relation_fields_to_delete.append(field_key)
 
     @observer.wrap({"type": "generate_vectors"})
@@ -495,10 +519,13 @@ class ResourceBrain:
         vectorset: str,
         replace_field: bool = False,
         # cut to specific dimension if specified
-        vector_dimension: Optional[int] = None,
+        vector_dimension: int | None = None,
+        append_splits: set[str] | None = None,
    ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
             _field_id = ids.FieldId(
                 rid=fid.rid,
                 type=fid.type,
@@ -567,7 +594,7 @@ class ResourceBrain:
         *,
         vectorset: str,
         # cut vectors if a specific dimension is specified
-        vector_dimension: Optional[int] = None,
+        vector_dimension: int | None = None,
     ):
         paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
         sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
@@ -592,7 +619,7 @@ class ResourceBrain:
 
         sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
 
-    def _set_resource_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
+    def _set_resource_status(self, basic: Basic, previous_status: Metadata.Status.ValueType | None):
         """
         We purposefully overwrite what we index as a status and DO NOT reflect
         actual status with what we index.
@@ -622,32 +649,32 @@ class ResourceBrain:
             return "EMPTY"
         return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]
 
-    def _set_resource_dates(self, basic: Basic, origin: Optional[Origin]):
+    def _set_resource_dates(self, basic: Basic, origin: Origin | None):
         """
         Adds the user-defined dates to the brain object. This is at resource level and applies to
         all fields of the resource.
         """
-        if basic.created.seconds > 0:
+        if basic.created.seconds != 0:
             self.brain.metadata.created.CopyFrom(basic.created)
         else:
             logging.warning(f"Basic metadata has no created field for {self.rid}")
             self.brain.metadata.created.GetCurrentTime()
-        if basic.modified.seconds > 0:
+        if basic.modified.seconds != 0:
             self.brain.metadata.modified.CopyFrom(basic.modified)
         else:
-            if basic.created.seconds > 0:
+            if basic.created.seconds != 0:
                 self.brain.metadata.modified.CopyFrom(basic.created)
             else:
                 self.brain.metadata.modified.GetCurrentTime()
 
         if origin is not None:
             # overwrite created/modified if provided on origin
-            if origin.HasField("created") and origin.created.seconds > 0:
+            if origin.HasField("created") and origin.created.seconds != 0:
                 self.brain.metadata.created.CopyFrom(origin.created)
-            if origin.HasField("modified") and origin.modified.seconds > 0:
+            if origin.HasField("modified") and origin.modified.seconds != 0:
                 self.brain.metadata.modified.CopyFrom(origin.modified)
 
-    def _set_resource_relations(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
+    def _set_resource_relations(self, basic: Basic, origin: Origin | None, user_relations: Relations):
         """
         Adds the relations to the brain object corresponding to the user-defined metadata at the resource level:
         - Contributors of the document
@@ -691,7 +718,7 @@ class ResourceBrain:
 
         self.brain.relation_fields_to_delete.append("a/metadata")
 
-    def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
+    def _set_resource_labels(self, basic: Basic, origin: Origin | None):
         """
         Adds the resource-level labels to the brain object.
         These levels are user-defined in basic or origin metadata.
@@ -748,7 +775,7 @@
 
 def is_paragraph_repeated_in_field(
     paragraph: Paragraph,
-    extracted_text: Optional[str],
+    extracted_text: str | None,
     unique_paragraphs: set[str],
 ) -> bool:
     if extracted_text is None:
@@ -787,8 +814,13 @@ class ParagraphPages:
             return self._materialized[paragraph_start_index]
         except IndexError:
             logger.error(
-                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"  # noqa
+                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"
             )
             if len(self._materialized) > 0:
                 return self._materialized[-1]
             return 0
+
+
+def should_skip_split_indexing(split: str, replace_field: bool, append_splits: set[str] | None) -> bool:
+    # When replacing the whole field, reindex all splits. Otherwise, we're only indexing the splits that are appended
+    return not replace_field and append_splits is not None and split not in append_splits
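The predicate only skips work on append-style updates: when replace_field is False and an append_splits set is given, any split outside that set is left untouched in the index. For instance:

    # Appending message "msg-3" to an existing conversation:
    append = {"msg-3"}
    assert should_skip_split_indexing("msg-1", replace_field=False, append_splits=append) is True
    assert should_skip_split_indexing("msg-3", replace_field=False, append_splits=append) is False
    # Full replace, or no append set at all: nothing is skipped.
    assert should_skip_split_indexing("msg-1", replace_field=True, append_splits=append) is False
    assert should_skip_split_indexing("msg-1", replace_field=False, append_splits=None) is False
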
--- a/nucliadb/ingest/orm/broker_message.py
+++ b/nucliadb/ingest/orm/broker_message.py
@@ -56,7 +56,7 @@ class _BrokerMessageBuilder:
         # clear the state and generate a new broker message
         self.bm.Clear()
 
-        self.bm.kbid = resource.kb.kbid
+        self.bm.kbid = resource.kbid
         self.bm.uuid = resource.uuid
         basic = await resource.get_basic()
         if basic is not None:
@@ -93,9 +93,7 @@ class _BrokerMessageBuilder:
             self.bm.link_extracted_data.append(link_extracted_data)
 
         # Field vectors
-        async for vectorset_id, vs in datamanagers.vectorsets.iter(
-            resource.txn, kbid=resource.kb.kbid
-        ):
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(resource.txn, kbid=resource.kbid):
             await self.generate_field_vectors(
                 type_id, field_id, field, vectorset_id, vs.storage_key_kind
             )
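With the vectorsets iterator, rebuilding a broker message emits vectors for every vectorset configured in the knowledge box rather than assuming a single one. A hedged sketch of the iteration shape, assuming only the datamanagers call shown in the diff; handle_vectorset is a placeholder:

    # Sketch: walk all vectorsets of a KB and process each one.
    async def process_all_vectorsets(txn, kbid):
        async for vectorset_id, vs in datamanagers.vectorsets.iter(txn, kbid=kbid):
            # vs.storage_key_kind indicates how this vectorset's vectors are keyed.
            await handle_vectorset(vectorset_id, vs.storage_key_kind)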