nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package, as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/entities.py
@@ -18,8 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-import asyncio
-from typing import AsyncGenerator, Optional
+from collections.abc import AsyncGenerator
 
 from nidx_protos.nodereader_pb2 import (
     Faceted,
@@ -29,23 +28,12 @@ from nidx_protos.nodereader_pb2 import (
     SearchResponse,
 )
 
-from nucliadb.common import datamanagers
-from nucliadb.common.cluster.exceptions import (
-    AlreadyExists,
-    EntitiesGroupNotFound,
-)
 from nucliadb.common.cluster.utils import get_shard_manager
-from nucliadb.common.datamanagers.entities import (
-    KB_DELETED_ENTITIES_GROUPS,
-    KB_ENTITIES,
-    KB_ENTITIES_GROUP,
-)
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.settings import settings
 from nucliadb.search.search.shards import graph_search_shard, query_shard
 from nucliadb_protos.knowledgebox_pb2 import (
-    DeletedEntitiesGroups,
     EntitiesGroup,
     EntitiesGroupSummary,
     Entity,
@@ -53,8 +41,6 @@ from nucliadb_protos.knowledgebox_pb2 import (
 from nucliadb_protos.utils_pb2 import RelationNode
 from nucliadb_protos.writer_pb2 import GetEntitiesResponse
 
-from .exceptions import EntityManagementException
-
 MAX_DUPLICATES = 300
 MAX_DELETED = 300
 
@@ -69,20 +55,11 @@ class EntitiesManager:
         self.txn = txn
         self.kbid = self.kb.kbid
 
-    async def create_entities_group(self, group: str, entities: EntitiesGroup):
-        if await self.entities_group_exists(group):
-            raise AlreadyExists(f"Entities group {group} already exists")
-
-        await self.store_entities_group(group, entities)
-
     async def get_entities(self, entities: GetEntitiesResponse):
         async for group, eg in self.iterate_entities_groups(exclude_deleted=True):
             entities.groups[group].CopyFrom(eg)
 
-    async def get_entities_group(self, group: str) -> Optional[EntitiesGroup]:
-        deleted = await self.is_entities_group_deleted(group)
-        if deleted:
-            return None
+    async def get_entities_group(self, group: str) -> EntitiesGroup | None:
         return await self.get_entities_group_inner(group)
 
     async def get_entities_groups(self) -> dict[str, EntitiesGroup]:
@@ -93,113 +70,18 @@ class EntitiesManager:
 
     async def list_entities_groups(self) -> dict[str, EntitiesGroupSummary]:
         groups = {}
-        max_simultaneous = asyncio.Semaphore(10)
 
-        async def _composition(group: str):
-            async with max_simultaneous:
-                stored = await self.get_stored_entities_group(group)
-                if stored is not None:
-                    groups[group] = EntitiesGroupSummary(
-                        title=stored.title, color=stored.color, custom=stored.custom
-                    )
-                else:
-                    # We don't want to search for each indexed group, as we are
-                    # providing a quick summary
-                    groups[group] = EntitiesGroupSummary()
+        async for group in self.iterate_entities_groups_names(exclude_deleted=True):
+            groups[group] = EntitiesGroupSummary()
 
-        tasks = [
-            asyncio.create_task(_composition(group))
-            async for group in self.iterate_entities_groups_names(exclude_deleted=True)
-        ]
-        if tasks:
-            await asyncio.wait(tasks)
         return groups
 
-    async def update_entities(self, group: str, entities: dict[str, Entity]):
-        """Update entities on an entity group. New entities are appended and existing
-        are overwriten. Existing entities not appearing in `entities` are left
-        intact. Use `delete_entities` to delete them instead.
-
-        """
-        if not await self.entities_group_exists(group):
-            raise EntitiesGroupNotFound(f"Entities group '{group}' doesn't exist")
-
-        entities_group = await self.get_stored_entities_group(group)
-        if entities_group is None:
-            entities_group = EntitiesGroup()
-
-        for name, entity in entities.items():
-            entities_group.entities[name].CopyFrom(entity)
-
-        await self.store_entities_group(group, entities_group)
-
-    async def set_entities_group(self, group: str, entities: EntitiesGroup):
-        indexed = await self.get_indexed_entities_group(group)
-        if indexed is None:
-            updated = entities
-        else:
-            updated = EntitiesGroup()
-            updated.CopyFrom(entities)
-
-            for name, entity in indexed.entities.items():
-                if name not in updated.entities:
-                    updated.entities[name].CopyFrom(entity)
-                    updated.entities[name].deleted = True
-
-        await self.store_entities_group(group, updated)
-
-    async def set_entities_group_force(self, group: str, entitiesgroup: EntitiesGroup):
-        await self.store_entities_group(group, entitiesgroup)
-
-    async def set_entities_group_metadata(
-        self, group: str, *, title: Optional[str] = None, color: Optional[str] = None
-    ):
-        entities_group = await self.get_stored_entities_group(group)
-        if entities_group is None:
-            entities_group = EntitiesGroup()
-
-        if title:
-            entities_group.title = title
-        if color:
-            entities_group.color = color
-
-        await self.store_entities_group(group, entities_group)
-
-    async def delete_entities(self, group: str, delete: list[str]):
-        stored = await self.get_stored_entities_group(group)
-
-        stored = stored or EntitiesGroup()
-        for name in delete:
-            if name not in stored.entities:
-                entity = stored.entities[name]
-                entity.value = name
-            else:
-                entity = stored.entities[name]
-                entity.deleted = True
-        await self.store_entities_group(group, stored)
-
-    async def delete_entities_group(self, group: str):
-        await self.delete_stored_entities_group(group)
-        await self.mark_entities_group_as_deleted(group)
-
     # Private API
 
-    async def get_entities_group_inner(self, group: str) -> Optional[EntitiesGroup]:
-        stored = await self.get_stored_entities_group(group)
-        indexed = await self.get_indexed_entities_group(group)
-        if stored is None and indexed is None:
-            # Entity group does not exist
-            return None
-        elif stored is not None and indexed is not None:
-            entities_group = self.merge_entities_groups(indexed, stored)
-        else:
-            entities_group = stored or indexed
-        return entities_group
-
-    async def get_stored_entities_group(self, group: str) -> Optional[EntitiesGroup]:
-        return await datamanagers.entities.get_entities_group(self.txn, kbid=self.kbid, group=group)
+    async def get_entities_group_inner(self, group: str) -> EntitiesGroup | None:
+        return await self.get_indexed_entities_group(group)
 
-    async def get_indexed_entities_group(self, group: str) -> Optional[EntitiesGroup]:
+    async def get_indexed_entities_group(self, group: str) -> EntitiesGroup | None:
         shard_manager = get_shard_manager()
 
         async def do_entities_search(shard_id: str) -> GraphSearchResponse:
@@ -228,26 +110,9 @@ class EntitiesManager:
         eg = EntitiesGroup(entities=entities)
         return eg
 
-    async def get_deleted_entities_groups(self) -> set[str]:
-        deleted: set[str] = set()
-        key = KB_DELETED_ENTITIES_GROUPS.format(kbid=self.kbid)
-        payload = await self.txn.get(key)
-        if payload:
-            deg = DeletedEntitiesGroups()
-            deg.ParseFromString(payload)
-            deleted.update(deg.entities_groups)
-        return deleted
-
     async def entities_group_exists(self, group: str) -> bool:
-        stored = await self.get_stored_entities_group(group)
-        if stored is not None:
-            return True
-
         indexed = await self.get_indexed_entities_group(group)
-        if indexed is not None:
-            return True
-
-        return False
+        return indexed is not None
 
     async def iterate_entities_groups(
         self, exclude_deleted: bool
@@ -262,27 +127,10 @@
         self,
         exclude_deleted: bool,
     ) -> AsyncGenerator[str, None]:
-        # Start the task to get indexed groups
-        indexed_task = asyncio.create_task(self.get_indexed_entities_groups_names())
-
-        if exclude_deleted:
-            deleted_groups = await self.get_deleted_entities_groups()
-
         visited_groups = set()
-
-        # stored groups
-        entities_key = KB_ENTITIES.format(kbid=self.kbid)
-        async for key in self.txn.keys(entities_key):
-            group = key.split("/")[-1]
-            if exclude_deleted and group in deleted_groups:
-                continue
-            yield group
-            visited_groups.add(group)
-
-        # indexed groups
-        indexed_groups = await indexed_task
+        indexed_groups = await self.get_indexed_entities_groups_names()
         for group in indexed_groups:
-            if (exclude_deleted and group in deleted_groups) or group in visited_groups:
+            if group in visited_groups:
                 continue
             yield group
             visited_groups.add(group)
@@ -319,53 +167,6 @@
             return set()
         return set.union(*results)
 
-    async def store_entities_group(self, group: str, eg: EntitiesGroup):
-        meta_cache = await datamanagers.entities.get_entities_meta_cache(self.txn, kbid=self.kbid)
-        duplicates = {}
-        deleted = []
-        duplicate_count = 0
-        for entity in eg.entities.values():
-            if entity.deleted:
-                deleted.append(entity.value)
-                continue
-            if len(entity.represents) == 0:
-                continue
-            duplicates[entity.value] = list(entity.represents)
-            duplicate_count += len(duplicates[entity.value])
-
-        if duplicate_count > MAX_DUPLICATES:
-            raise EntityManagementException(
-                f"Too many duplicates: {duplicate_count}. Max of {MAX_DUPLICATES} currently allowed"
-            )
-        if len(deleted) > MAX_DELETED:
-            raise EntityManagementException(
-                f"Too many deleted entities: {len(deleted)}. Max of {MAX_DELETED} currently allowed"
-            )
-
-        meta_cache.set_duplicates(group, duplicates)
-        meta_cache.set_deleted(group, deleted)
-        await datamanagers.entities.set_entities_meta_cache(self.txn, kbid=self.kbid, cache=meta_cache)
-
-        await datamanagers.entities.set_entities_group(
-            self.txn, kbid=self.kbid, group_id=group, entities=eg
-        )
-        # if it was preivously deleted, we must unmark it
-        await self.unmark_entities_group_as_deleted(group)
-
-    async def is_entities_group_deleted(self, group: str):
-        deleted_groups = await self.get_deleted_entities_groups()
-        return group in deleted_groups
-
-    async def delete_stored_entities_group(self, group: str):
-        entities_key = KB_ENTITIES_GROUP.format(kbid=self.kbid, id=group)
-        await self.txn.delete(entities_key)
-
-    async def mark_entities_group_as_deleted(self, group: str):
-        await datamanagers.entities.mark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
-    async def unmark_entities_group_as_deleted(self, group: str):
-        await datamanagers.entities.unmark_group_as_deleted(self.txn, kbid=self.kbid, group=group)
-
     @staticmethod
     def merge_entities_groups(indexed: EntitiesGroup, stored: EntitiesGroup):
         """Create a new EntitiesGroup with the merged entities from `stored` and
nucliadb/ingest/orm/index_message.py
@@ -20,11 +20,12 @@
 
 
 import asyncio
-from typing import Optional
+from collections.abc import Sequence
 
 from nidx_protos.noderesources_pb2 import Resource as IndexMessage
 
 from nucliadb.common import datamanagers
+from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.orm.brain_v2 import ResourceBrain
@@ -32,6 +33,7 @@ from nucliadb.ingest.orm.metrics import index_message_observer as observer
 from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
+from nucliadb_protos.utils_pb2 import ExtractedText
 from nucliadb_protos.writer_pb2 import BrokerMessage
 
 
@@ -68,7 +70,8 @@ class IndexMessageBuilder:
         vectors: bool = True,
         relations: bool = True,
         replace: bool = True,
-        vectorset_configs: Optional[list[VectorSetConfig]] = None,
+        vectorset_configs: list[VectorSetConfig] | None = None,
+        append_splits: set[str] | None = None,
     ):
         field = await self.resource.get_field(fieldid.field, fieldid.field_type)
         extracted_text = await field.get_extracted_text()
@@ -120,6 +123,7 @@
                 replace_field=replace_paragraphs,
                 skip_paragraphs_index=skip_paragraphs_index,
                 skip_texts_index=skip_texts_index,
+                append_splits=append_splits,
             )
         if vectors:
             assert vectorset_configs is not None
@@ -137,6 +141,7 @@
                     vectorset=vectorset_config.vectorset_id,
                     replace_field=replace,
                     vector_dimension=dimension,
+                    append_splits=append_splits,
                 )
         if relations:
             await asyncio.to_thread(
@@ -150,7 +155,7 @@
     def _apply_field_deletions(
         self,
         brain: ResourceBrain,
-        field_ids: list[FieldID],
+        field_ids: Sequence[FieldID],
     ) -> None:
         for field_id in field_ids:
             brain.delete_field(self.resource.generate_field_id(field_id))
@@ -158,20 +163,19 @@
     @observer.wrap({"type": "writer_bm"})
     async def for_writer_bm(
         self,
-        messages: list[BrokerMessage],
+        message: BrokerMessage,
         resource_created: bool,
     ) -> IndexMessage:
         """
-        Builds the index message for the broker messages coming from the writer.
+        Builds the index message for the broker message coming from the writer.
         The writer messages are not adding new vectors to the index.
         """
-        assert all(message.source == BrokerMessage.MessageSource.WRITER for message in messages)
+        assert message.source == BrokerMessage.MessageSource.WRITER
 
-        deleted_fields = get_bm_deleted_fields(messages)
-        self._apply_field_deletions(self.brain, deleted_fields)
+        self._apply_field_deletions(self.brain, message.delete_fields)
         await self._apply_resource_index_data(self.brain)
         basic = await self.get_basic()
-        prefilter_update = needs_prefilter_update(messages)
+        prefilter_update = needs_prefilter_update(message)
         if prefilter_update:
             # Changes on some metadata at the resource level that is used for filtering require that we reindex all the fields
             # in the texts index (as it is the one used for prefiltering).
@@ -181,16 +185,16 @@
             ]
         else:
             # Simply process the fields that are in the message
-            fields_to_index = get_bm_modified_fields(messages)
+            fields_to_index = get_bm_modified_fields(message)
         for fieldid in fields_to_index:
-            if fieldid in deleted_fields:
+            if fieldid in message.delete_fields:
                 continue
             await self._apply_field_index_data(
                 self.brain,
                 fieldid,
                 basic,
-                texts=prefilter_update or needs_texts_update(fieldid, messages),
-                paragraphs=needs_paragraphs_update(fieldid, messages),
+                texts=prefilter_update or needs_texts_update(fieldid, message),
+                paragraphs=needs_paragraphs_update(fieldid, message),
                 relations=False,  # Relations at the field level are not modified by the writer
                 vectors=False,  # Vectors are never added by the writer
                 replace=not resource_created,
@@ -200,32 +204,45 @@
     @observer.wrap({"type": "processor_bm"})
    async def for_processor_bm(
         self,
-        messages: list[BrokerMessage],
+        message: BrokerMessage,
     ) -> IndexMessage:
         """
         Builds the index message for the broker messages coming from the processor.
         The processor can index new data to any index.
         """
-        assert all(message.source == BrokerMessage.MessageSource.PROCESSOR for message in messages)
-        deleted_fields = get_bm_deleted_fields(messages)
-        self._apply_field_deletions(self.brain, deleted_fields)
+        assert message.source == BrokerMessage.MessageSource.PROCESSOR
+        self._apply_field_deletions(self.brain, message.delete_fields)
         await self._apply_resource_index_data(self.brain)
         basic = await self.get_basic()
-        fields_to_index = get_bm_modified_fields(messages)
+        fields_to_index = get_bm_modified_fields(message)
         vectorsets_configs = await self.get_vectorsets_configs()
         for fieldid in fields_to_index:
-            if fieldid in deleted_fields:
+            if fieldid in message.delete_fields:
                 continue
+
+            # For conversation fields, we only replace the full field if it is not an append messages operation.
+            # All other fields are always replaced upon modification.
+            replace_field = True
+            modified_splits = None
+            if fieldid.field_type == FieldType.CONVERSATION:
+                modified_splits = await get_bm_modified_split_ids(fieldid, message, self.resource)
+                stored_splits = await get_stored_split_ids(fieldid, self.resource)
+                is_append_messages_op = modified_splits.issubset(stored_splits) and 0 < len(
+                    modified_splits
+                ) < len(stored_splits)
+                replace_field = not is_append_messages_op
+
             await self._apply_field_index_data(
                 self.brain,
                 fieldid,
                 basic,
-                texts=needs_texts_update(fieldid, messages),
-                paragraphs=needs_paragraphs_update(fieldid, messages),
-                relations=needs_relations_update(fieldid, messages),
-                vectors=needs_vectors_update(fieldid, messages),
-                replace=True,
+                texts=needs_texts_update(fieldid, message),
+                paragraphs=needs_paragraphs_update(fieldid, message),
+                relations=needs_relations_update(fieldid, message),
+                vectors=needs_vectors_update(fieldid, message),
+                replace=replace_field,
                 vectorset_configs=vectorsets_configs,
+                append_splits=modified_splits,
            )
         return self.brain.brain
 
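The conversation-specific branch added in for_processor_bm decides between rewriting the whole field and appending only the splits carried by the message. The decision is plain set arithmetic over split ids; here it is restated as a standalone predicate (the function name mirrors the inline variable from the diff; the split ids are made up):

def is_append_messages_op(modified_splits: set[str], stored_splits: set[str]) -> bool:
    # An append touches only splits the stored field already knows about,
    # and touches some but not all of them; anything else is a full rewrite.
    return modified_splits.issubset(stored_splits) and 0 < len(modified_splits) < len(stored_splits)

stored = {"m1", "m2", "m3"}  # split ids recorded in the field's splits metadata
assert is_append_messages_op({"m3"}, stored)       # strict non-empty subset: append
assert not is_append_messages_op(set(), stored)    # nothing modified: replace path
assert not is_append_messages_op(stored, stored)   # every split touched: full rewrite
assert not is_append_messages_op({"m4"}, stored)   # unknown split id: full rewrite

When the predicate holds, the builder passes the modified split ids as append_splits and sets replace_field to False, so only the new conversation messages are reindexed instead of the whole field.
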
@@ -264,136 +281,134 @@
         vectorset_configs = [
             vectorset_config
             async for _, vectorset_config in datamanagers.vectorsets.iter(
-                self.resource.txn, kbid=self.resource.kb.kbid
+                self.resource.txn, kbid=self.resource.kbid
             )
         ]
         return vectorset_configs
 
 
-def get_bm_deleted_fields(
-    messages: list[BrokerMessage],
-) -> list[FieldID]:
-    deleted = []
-    for message in messages:
-        for field in message.delete_fields:
-            if field not in deleted:
-                deleted.append(field)
-    return deleted
-
-
-def get_bm_modified_fields(messages: list[BrokerMessage]) -> list[FieldID]:
-    message_source = get_messages_source(messages)
+def get_bm_modified_fields(message: BrokerMessage) -> list[FieldID]:
     modified = set()
-    for message in messages:
-        # Added or modified fields need indexing
-        for link in message.links:
-            modified.add((link, FieldType.LINK))
-        for file in message.files:
-            modified.add((file, FieldType.FILE))
-        for conv in message.conversations:
-            modified.add((conv, FieldType.CONVERSATION))
-        for text in message.texts:
-            modified.add((text, FieldType.TEXT))
+    # Added or modified fields need indexing
+    for link in message.links:
+        modified.add((link, FieldType.LINK))
+    for file in message.files:
+        modified.add((file, FieldType.FILE))
+    for conv in message.conversations:
+        modified.add((conv, FieldType.CONVERSATION))
+    for text in message.texts:
+        modified.add((text, FieldType.TEXT))
+    if message.HasField("basic"):
+        # Add title and summary only if they have changed
+        if message.basic.title != "":
+            modified.add(("title", FieldType.GENERIC))
+        if message.basic.summary != "":
+            modified.add(("summary", FieldType.GENERIC))
+
+    if message.source == BrokerMessage.MessageSource.PROCESSOR:
+        # Messages with field metadata, extracted text or field vectors need indexing
+        for fm in message.field_metadata:
+            modified.add((fm.field.field, fm.field.field_type))
+        for et in message.extracted_text:
+            modified.add((et.field.field, et.field.field_type))
+        for fv in message.field_vectors:
+            modified.add((fv.field.field, fv.field.field_type))
+
+    if message.source == BrokerMessage.MessageSource.WRITER:
+        # Any field that has fieldmetadata annotations should be considered as modified
+        # and needs to be reindexed
         if message.HasField("basic"):
-            # Add title and summary only if they have changed
-            if message.basic.title != "":
-                modified.add(("title", FieldType.GENERIC))
-            if message.basic.summary != "":
-                modified.add(("summary", FieldType.GENERIC))
-
-        if message_source == BrokerMessage.MessageSource.PROCESSOR:
-            # Messages with field metadata, extracted text or field vectors need indexing
-            for fm in message.field_metadata:
-                modified.add((fm.field.field, fm.field.field_type))
-            for et in message.extracted_text:
-                modified.add((et.field.field, et.field.field_type))
-            for fv in message.field_vectors:
-                modified.add((fv.field.field, fv.field.field_type))
-
-        if message_source == BrokerMessage.MessageSource.WRITER:
-            # Any field that has fieldmetadata annotations should be considered as modified
-            # and needs to be reindexed
-            if message.HasField("basic"):
-                for ufm in message.basic.fieldmetadata:
-                    modified.add((ufm.field.field, ufm.field.field_type))
+            for ufm in message.basic.fieldmetadata:
+                modified.add((ufm.field.field, ufm.field.field_type))
     return [FieldID(field=field, field_type=field_type) for field, field_type in modified]
 
 
-def get_messages_source(messages: list[BrokerMessage]) -> BrokerMessage.MessageSource.ValueType:
-    assert len(set(message.source for message in messages)) == 1
-    return messages[0].source
+def needs_prefilter_update(message: BrokerMessage) -> bool:
+    return message.reindex
 
 
-def needs_prefilter_update(messages: list[BrokerMessage]) -> bool:
-    return any(message.reindex for message in messages)
-
-
-def needs_paragraphs_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
+def needs_paragraphs_update(field_id: FieldID, message: BrokerMessage) -> bool:
     return (
-        has_paragraph_annotations(field_id, messages)
-        or has_new_extracted_text(field_id, messages)
-        or has_new_field_metadata(field_id, messages)
+        has_paragraph_annotations(field_id, message)
+        or has_new_extracted_text(field_id, message)
+        or has_new_field_metadata(field_id, message)
     )
 
 
-def has_paragraph_annotations(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
-    for message in messages:
-        ufm = next(
-            (fm for fm in message.basic.fieldmetadata if fm.field == field_id),
-            None,
-        )
-        if ufm is None:
-            continue
-        if len(ufm.paragraphs) > 0:
-            return True
-    return False
+def has_paragraph_annotations(field_id: FieldID, message: BrokerMessage) -> bool:
+    ufm = next(
+        (fm for fm in message.basic.fieldmetadata if fm.field == field_id),
+        None,
+    )
+    if ufm is None:
+        return False
+    return len(ufm.paragraphs) > 0
 
 
 def has_new_field_metadata(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for field_metadata in message.field_metadata:
-            if field_metadata.field == field_id:
-                return True
-    return False
+    return any(field_metadata.field == field_id for field_metadata in message.field_metadata)
 
 
 def has_new_extracted_text(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for extracted_text in message.extracted_text:
-            if extracted_text.field == field_id:
-                return True
-    return False
+    return any(extracted_text.field == field_id for extracted_text in message.extracted_text)
 
 
 def needs_texts_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    return has_new_extracted_text(field_id, messages) or has_new_field_metadata(field_id, messages)
+    return has_new_extracted_text(field_id, message) or has_new_field_metadata(field_id, message)
 
 
 def needs_vectors_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for field_vectors in message.field_vectors:
-            if field_vectors.field == field_id:
-                return True
-    return False
+    return any(field_vectors.field == field_id for field_vectors in message.field_vectors)
+
+
+async def get_bm_modified_split_ids(
+    conversation_field_id: FieldID,
+    message: BrokerMessage,
+    resource: Resource,
+) -> set[str]:
+    message_etw = next(
+        (etw for etw in message.extracted_text if etw.field == conversation_field_id), None
+    )
+    if message_etw is None:
+        return set()
+    storage = resource.storage
+    if message_etw.HasField("file"):
+        raw_payload = await storage.downloadbytescf(message_etw.file)
+        message_extracted_text = ExtractedText()
+        message_extracted_text.ParseFromString(raw_payload.read())
+        raw_payload.flush()
+    else:
+        message_extracted_text = message_etw.body
+    return set(message_extracted_text.split_text.keys())
+
+
+async def get_stored_split_ids(
+    conversation_field_id: FieldID,
+    resource: Resource,
+) -> set[str]:
+    fid = conversation_field_id
+    conv: Conversation = await resource.get_field(fid.field, fid.field_type, load=False)
+    splits_metadata = await conv.get_splits_metadata()
+    return set(splits_metadata.metadata)
 
 
 def needs_relations_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    return has_new_field_metadata(field_id, messages) or has_new_extracted_text(field_id, messages)
+    return has_new_field_metadata(field_id, message) or has_new_extracted_text(field_id, message)
 
 
 async def get_resource_index_message(