nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/search/hydrator/__init__.py

@@ -17,33 +17,11 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-import asyncio
-import logging
-from contextlib import AsyncExitStack
-from typing import Optional
-
 from pydantic import BaseModel
 
-from nucliadb.common.external_index_providers.base import TextBlockMatch
-from nucliadb.common.ids import FieldId
-from nucliadb.common.maindb.utils import get_driver
-from nucliadb.ingest.serialize import managed_serialize
-from nucliadb.search.search import cache
-from nucliadb.search.search.paragraphs import get_paragraph_text
 from nucliadb_models.common import FieldTypeName
-from nucliadb_models.resource import ExtractedDataTypeName, Resource
-from nucliadb_models.search import (
-    FindParagraph,
-    ResourceProperties,
-)
-from nucliadb_telemetry.metrics import Observer
-from nucliadb_utils import const
-from nucliadb_utils.asyncio_utils import ConcurrentRunner
-from nucliadb_utils.utilities import has_feature
-
-logger = logging.getLogger(__name__)
-
-hydrator_observer = Observer("hydrator", labels={"type": ""})
+from nucliadb_models.resource import ExtractedDataTypeName
+from nucliadb_models.search import ResourceProperties
 
 
 class ResourceHydrationOptions(BaseModel):
@@ -65,134 +43,7 @@ class TextBlockHydrationOptions(BaseModel):
     highlight: bool = False
 
     # list of exact matches to highlight
-    ematches: Optional[list[str]] = None
+    ematches: list[str] | None = None
 
     # If true, only hydrate the text block if its text is not already populated
     only_hydrate_empty: bool = False
-
-
-@hydrator_observer.wrap({"type": "resource_text"})
-async def hydrate_resource_text(
-    kbid: str, rid: str, *, max_concurrent_tasks: int
-) -> list[tuple[FieldId, str]]:
-    resource = await cache.get_resource(kbid, rid)
-    if resource is None:  # pragma: no cover
-        return []
-
-    # Schedule the extraction of the text of each field in the resource
-    async with get_driver().ro_transaction() as txn:
-        resource.txn = txn
-        runner = ConcurrentRunner(max_tasks=max_concurrent_tasks)
-        for field_type, field_key in await resource.get_fields(force=True):
-            field_id = FieldId.from_pb(rid, field_type, field_key)
-            runner.schedule(hydrate_field_text(kbid, field_id))
-
-        # Include the summary aswell
-        runner.schedule(hydrate_field_text(kbid, FieldId(rid=rid, type="a", key="summary")))
-
-        # Wait for the results
-        field_extracted_texts = await runner.wait()
-
-    return [text for text in field_extracted_texts if text is not None]
-
-
-@hydrator_observer.wrap({"type": "resource_metadata"})
-async def hydrate_resource_metadata(
-    kbid: str,
-    resource_id: str,
-    options: ResourceHydrationOptions,
-    *,
-    concurrency_control: Optional[asyncio.Semaphore] = None,
-    service_name: Optional[str] = None,
-) -> Optional[Resource]:
-    """Fetch resource metadata and return it serialized."""
-    show = options.show
-    extracted = options.extracted
-
-    if ResourceProperties.EXTRACTED in show and has_feature(
-        const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
-    ):
-        # Returning extracted metadata in search results is deprecated and this flag
-        # will be set to True for all KBs in the future.
-        show.remove(ResourceProperties.EXTRACTED)
-        extracted = []
-
-    async with AsyncExitStack() as stack:
-        if concurrency_control is not None:
-            await stack.enter_async_context(concurrency_control)
-
-        async with get_driver().ro_transaction() as ro_txn:
-            serialized_resource = await managed_serialize(
-                txn=ro_txn,
-                kbid=kbid,
-                rid=resource_id,
-                show=show,
-                field_type_filter=options.field_type_filter,
-                extracted=extracted,
-                service_name=service_name,
-            )
-            if serialized_resource is None:
-                logger.warning(
-                    "Resource not found in database", extra={"kbid": kbid, "rid": resource_id}
-                )
-            return serialized_resource
-
-
-@hydrator_observer.wrap({"type": "field_text"})
-async def hydrate_field_text(
-    kbid: str,
-    field_id: FieldId,
-) -> Optional[tuple[FieldId, str]]:
-    extracted_text_pb = await cache.get_extracted_text_from_field_id(kbid, field_id)
-    if extracted_text_pb is None:  # pragma: no cover
-        return None
-
-    if field_id.subfield_id:
-        return field_id, extracted_text_pb.split_text[field_id.subfield_id]
-    else:
-        return field_id, extracted_text_pb.text
-
-
-@hydrator_observer.wrap({"type": "text_block"})
-async def hydrate_text_block(
-    kbid: str,
-    text_block: TextBlockMatch,
-    options: TextBlockHydrationOptions,
-    *,
-    concurrency_control: Optional[asyncio.Semaphore] = None,
-) -> TextBlockMatch:
-    """Given a `text_block`, fetch its corresponding text, modify and return the
-    `text_block` object.
-
-    """
-    if options.only_hydrate_empty and text_block.text:
-        return text_block
-    async with AsyncExitStack() as stack:
-        if concurrency_control is not None:
-            await stack.enter_async_context(concurrency_control)
-
-        text_block.text = await get_paragraph_text(
-            kbid=kbid,
-            paragraph_id=text_block.paragraph_id,
-            highlight=options.highlight,
-            matches=[],  # TODO: this was never implemented
-            ematches=options.ematches,
-        )
-    return text_block
-
-
-def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
-    return FindParagraph(
-        id=text_block.paragraph_id.full(),
-        text=text_block.text or "",
-        score=text_block.score,
-        score_type=text_block.score_type,
-        order=text_block.order,
-        labels=text_block.paragraph_labels,
-        fuzzy_result=text_block.fuzzy_search,
-        is_a_table=text_block.is_a_table,
-        reference=text_block.representation_file,
-        page_with_visual=text_block.page_with_visual,
-        position=text_block.position,
-        relevant_relations=text_block.relevant_relations,
-    )
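The options model kept in this file also moves from `typing.Optional[...]` to the PEP 604 `X | None` spelling, a change repeated throughout the hunks below. A minimal standalone sketch (illustrative only, not code from the package) showing the two spellings are interchangeable on Python 3.10+:

    from typing import Optional

    # Pre-3.10 spelling, still accepted by type checkers:
    legacy: Optional[int] = None

    # PEP 604 spelling adopted in nucliadb 6.10.0:
    modern: int | None = None

    # Both describe the same union; on Python 3.10+ they even compare equal at runtime.
    assert Optional[int] == (int | None)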
nucliadb/search/search/hydrator/fields.py

@@ -17,12 +17,25 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+from typing import cast
 
+from typing_extensions import assert_never
 
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId
-from nucliadb.common.models_utils import from_proto
-from nucliadb.ingest.orm.resource import Resource
-from nucliadb.search.search.hydrator import hydrate_field_text
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.fields.conversation import Conversation
+from nucliadb.ingest.fields.file import File
+from nucliadb.ingest.fields.generic import Generic
+from nucliadb.ingest.fields.link import Link
+from nucliadb.ingest.fields.text import Text
+from nucliadb.models.internal.augment import ConversationProp, FieldProp, FieldText, FieldValue
+from nucliadb.search.augmentor.fields import (
+    db_augment_conversation_field,
+    db_augment_file_field,
+    db_augment_generic_field,
+    db_augment_link_field,
+    db_augment_text_field,
+)
 from nucliadb_models import hydration as hydration_models
 from nucliadb_models.common import FieldTypeName
 
@@ -32,144 +45,173 @@ def page_preview_id(page_number: int) -> str:
     return f"{page_number}"
 
 
-async def hydrate_field(resource: Resource, field_id: FieldId, config: hydration_models.FieldHydration):
+async def hydrate_field(field: Field, field_id: FieldId, config: hydration_models.FieldHydration):
     field_type = FIELD_TYPE_STR_TO_NAME[field_id.type]
 
     if field_type == FieldTypeName.TEXT:
         if not config.text is not None:
             return
-        return await hydrate_text_field(resource, field_id, config.text)
+        field = cast(Text, field)
+        return await hydrate_text_field(field, field_id, config.text)
 
     elif field_type == FieldTypeName.FILE is not None:
         if not config.file:
             return
-        return await hydrate_file_field(resource, field_id, config.file)
+        field = cast(File, field)
+        return await hydrate_file_field(field, field_id, config.file)
 
     elif field_type == FieldTypeName.LINK is not None:
         if not config.link:
             return
-        return await hydrate_link_field(resource, field_id, config.link)
+        field = cast(Link, field)
+        return await hydrate_link_field(field, field_id, config.link)
 
     elif field_type == FieldTypeName.CONVERSATION is not None:
         if not config.conversation:
             return
-        return await hydrate_conversation_field(resource, field_id, config.conversation)
+        field = cast(Conversation, field)
+        return await hydrate_conversation_field(field, field_id, config.conversation)
 
     elif field_type == FieldTypeName.GENERIC is not None:
         if not config.generic:
             return
-        return await hydrate_generic_field(resource, field_id, config.generic)
+        field = cast(Generic, field)
+        return await hydrate_generic_field(field, field_id, config.generic)
 
     else:  # pragma: no cover
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(field_type)
 
 
 async def hydrate_text_field(
-    resource: Resource,
+    field: Text,
     field_id: FieldId,
     config: hydration_models.TextFieldHydration,
 ) -> hydration_models.HydratedTextField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+
+    augmented = await db_augment_text_field(field, field_id, select)
+
     hydrated = hydration_models.HydratedTextField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.TEXT,
     )
 
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
+
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
 
     return hydrated
 
 
 async def hydrate_file_field(
-    resource: Resource,
+    field: File,
     field_id: FieldId,
     config: hydration_models.FileFieldHydration,
 ) -> hydration_models.HydratedFileField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+
+    augmented = await db_augment_file_field(field, field_id, select)
+
     hydrated = hydration_models.HydratedFileField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.FILE,
     )
 
-    if config.value:
-        field = await resource.get_field(field_id.key, field_id.pb_type)
-        value = await field.get_value()
-        hydrated.value = from_proto.field_file(value)
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
 
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
 
     return hydrated
 
 
 async def hydrate_link_field(
-    resource: Resource,
+    field: Link,
     field_id: FieldId,
     config: hydration_models.LinkFieldHydration,
 ) -> hydration_models.HydratedLinkField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+
+    augmented = await db_augment_link_field(field, field_id, select)
+
     hydrated = hydration_models.HydratedLinkField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.LINK,
     )
 
-    if config.value:
-        field = await resource.get_field(field_id.key, field_id.pb_type)
-        value = await field.get_value()
-        hydrated.value = from_proto.field_link(value)
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
 
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
 
     return hydrated
 
 
 async def hydrate_conversation_field(
-    resource: Resource,
+    field: Conversation,
     field_id: FieldId,
     config: hydration_models.ConversationFieldHydration,
 ) -> hydration_models.HydratedConversationField:
+    select: list[ConversationProp] = []
+    if config.value:
+        select.append(FieldValue())
+
+    augmented = await db_augment_conversation_field(field, field_id, select)
+
     hydrated = hydration_models.HydratedConversationField(
         id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.CONVERSATION,
     )
-    # TODO: implement conversation fields
+
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
+
     return hydrated
 
 
 async def hydrate_generic_field(
-    resource: Resource,
+    field: Generic,
     field_id: FieldId,
     config: hydration_models.GenericFieldHydration,
 ) -> hydration_models.HydratedGenericField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+
+    augmented = await db_augment_generic_field(field, field_id, select)
+
     hydrated = hydration_models.HydratedGenericField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.GENERIC,
     )
 
-    if config.value:
-        field = await resource.get_field(field_id.key, field_id.pb_type)
-        value = await field.get_value()
-        hydrated.value = value
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
 
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
 
     return hydrated
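For orientation, here is a condensed sketch of the select-then-augment pattern the rewritten hydrate_*_field helpers above now share. It uses only names visible in the hunk (FieldProp, FieldValue, FieldText, db_augment_text_field) and reads .value/.text on the result exactly as the diff does; treat it as an illustration of the call shape, not documented API, and the wrapper function name is hypothetical:

    from nucliadb.common.ids import FieldId
    from nucliadb.ingest.fields.text import Text
    from nucliadb.models.internal.augment import FieldProp, FieldText, FieldValue
    from nucliadb.search.augmentor.fields import db_augment_text_field


    async def fetch_text_field_data(field: Text, field_id: FieldId, want_value: bool, want_text: bool):
        # Declare which properties to fetch, then make a single augmentor call
        # instead of separate get_value()/extracted-text lookups.
        select: list[FieldProp] = []
        if want_value:
            select.append(FieldValue())
        if want_text:
            select.append(FieldText())
        augmented = await db_augment_text_field(field, field_id, select)
        return augmented.value, augmented.text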
nucliadb/search/search/hydrator/images.py

@@ -18,7 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import base64
-from typing import Optional, cast
+from typing import cast
+
+from typing_extensions import assert_never
 
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId, ParagraphId
 from nucliadb.ingest.fields.base import Field
@@ -32,7 +34,7 @@ from nucliadb_utils.utilities import get_storage
 
 async def paragraph_source_image(
     kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
-) -> Optional[Image]:
+) -> Image | None:
     """Certain paragraphs are extracted from images using techniques like OCR or
     inception. If that's the case, return the original image for this paragraph.
 
@@ -66,7 +68,7 @@
 
 async def download_image(
     kbid: str, field_id: FieldId, image_path: str, *, mime_type: str
-) -> Optional[Image]:
+) -> Image | None:
     storage = await get_storage(service_name=SERVICE_NAME)
     sf = storage.file_extracted(
         kbid,
@@ -81,7 +83,7 @@
     return Image(content_type=mime_type, b64encoded=base64.b64encode(raw_image).decode())
 
 
-async def download_page_preview(field: Field, page: int) -> Optional[Image]:
+async def download_page_preview(field: Field, page: int) -> Image | None:
     """Download a specific page preview for a field and return it as an Image.
     As not all fields have previews, this function can return None.
 
@@ -123,8 +125,6 @@ async def download_page_preview(field: Field, page: int) -> Optional[Image]:
         image = None
 
     else:  # pragma: no cover
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(field_type)
 
     return image
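Both images.py and fields.py drop the old `_a: int = "a"` mypy trick in favour of typing_extensions.assert_never for exhaustiveness checking. A self-contained sketch of the idiom (the enum below is illustrative, not the real FieldTypeName):

    from enum import Enum

    from typing_extensions import assert_never


    class Shape(Enum):
        CIRCLE = "circle"
        SQUARE = "square"


    def area_formula(shape: Shape) -> str:
        if shape is Shape.CIRCLE:
            return "pi * r**2"
        elif shape is Shape.SQUARE:
            return "side**2"
        else:
            # mypy flags this call if any Shape member is left unhandled above;
            # at runtime it raises AssertionError if ever reached.
            assert_never(shape)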
nucliadb/search/search/hydrator/paragraphs.py

@@ -19,12 +19,11 @@
 #
 import asyncio
 from dataclasses import dataclass
-from typing import Optional, Union
 
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.orm.resource import Resource
-from nucliadb.search.search import paragraphs
+from nucliadb.search.augmentor.paragraphs import get_paragraph_text
 from nucliadb.search.search.hydrator.fields import page_preview_id
 from nucliadb.search.search.hydrator.images import paragraph_source_image
 from nucliadb_models import hydration as hydration_models
@@ -112,19 +111,19 @@ class ParagraphIndex:
             replacement for replacement in paragraph.relations.replacements
         ]
 
-    def get(self, paragraph_id: Union[str, ParagraphId]) -> Optional[resources_pb2.Paragraph]:
+    def get(self, paragraph_id: str | ParagraphId) -> resources_pb2.Paragraph | None:
         paragraph_id = str(paragraph_id)
         return self.paragraphs.get(paragraph_id)
 
-    def previous(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+    def previous(self, paragraph_id: str | ParagraphId) -> str | None:
         paragraph_id = str(paragraph_id)
         return self.neighbours.get((paragraph_id, ParagraphIndex.PREVIOUS))
 
-    def next(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+    def next(self, paragraph_id: str | ParagraphId) -> str | None:
         paragraph_id = str(paragraph_id)
         return self.neighbours.get((paragraph_id, ParagraphIndex.NEXT))
 
-    def n_previous(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+    def n_previous(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
         assert count >= 1, f"can't find negative previous {count}"
         paragraph_id = str(paragraph_id)
         previous: list[str] = []
@@ -138,7 +137,7 @@ class ParagraphIndex:
             current_id = previous_id
         return previous
 
-    def n_next(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+    def n_next(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
         assert count >= 1, f"can't find negative nexts {count}"
         paragraph_id = str(paragraph_id)
         nexts = []
@@ -152,23 +151,23 @@ class ParagraphIndex:
             nexts.append(next_id)
         return nexts
 
-    def parents(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+    def parents(self, paragraph_id: str | ParagraphId) -> list[str]:
         paragraph_id = str(paragraph_id)
         return self.related.get((paragraph_id, ParagraphIndex.PARENTS), [])
 
-    def siblings(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+    def siblings(self, paragraph_id: str | ParagraphId) -> list[str]:
         paragraph_id = str(paragraph_id)
         return self.related.get((paragraph_id, ParagraphIndex.SIBLINGS), [])
 
-    def replacements(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+    def replacements(self, paragraph_id: str | ParagraphId) -> list[str]:
         paragraph_id = str(paragraph_id)
         return self.related.get((paragraph_id, ParagraphIndex.REPLACEMENTS), [])
 
 
 @dataclass
 class ExtraParagraphHydration:
-    field_page: Optional[int]
-    field_table_page: Optional[int]
+    field_page: int | None
+    field_table_page: int | None
     related_paragraph_ids: list[ParagraphId]
 
 
@@ -187,7 +186,7 @@ async def hydrate_paragraph(
     include more or less text than the originally extracted.
 
     """
-    kbid = resource.kb.kbid
+    kbid = resource.kbid
 
     hydrated = hydration_models.HydratedParagraph(
         id=paragraph_id.full(),
@@ -199,7 +198,7 @@
     )
 
     if config.text:
-        text = await paragraphs.get_paragraph_text(kbid=kbid, paragraph_id=paragraph_id)
+        text = await get_paragraph_text(field, paragraph_id)
         hydrated.text = text
 
     requires_paragraph_metadata = config.image or config.table or config.page or config.related
@@ -210,8 +209,20 @@
         # otherwise, this is a fake paragraph. We can't hydrate anything else here
 
     if config.related:
+        if config.related.neighbours is not None:
+            before = config.related.neighbours.before
+            after = config.related.neighbours.after
+        else:
+            before, after = None, None
+
         hydrated.related, related_ids = await related_paragraphs_refs(
-            paragraph_id, field_paragraphs_index, config.related
+            paragraph_id,
+            field_paragraphs_index,
+            neighbours_before=before,
+            neighbours_after=after,
+            parents=config.related.parents or False,
+            siblings=config.related.siblings or False,
+            replacements=config.related.replacements or False,
         )
         extra_hydration.related_paragraph_ids = related_ids
 
@@ -259,7 +270,12 @@
 async def related_paragraphs_refs(
     paragraph_id: ParagraphId,
     index: ParagraphIndex,
-    config: hydration_models.RelatedParagraphHydration,
+    *,
+    neighbours_before: int | None = None,
+    neighbours_after: int | None = None,
+    parents: bool = False,
+    siblings: bool = False,
+    replacements: bool = False,
 ) -> tuple[hydration_models.RelatedParagraphRefs, list[ParagraphId]]:
     """Compute the related paragraph references for a specific `paragraph_id`
     and return them with the plain list of unique related paragraphs (to
@@ -269,36 +285,36 @@ async def related_paragraphs_refs(
     hydrated = hydration_models.RelatedParagraphRefs()
     related = set()
 
-    if config.neighbours:
+    if neighbours_before or neighbours_after:
         hydrated.neighbours = hydration_models.RelatedNeighbourParagraphRefs()
 
-        if config.neighbours.before is not None:
+        if neighbours_before is not None:
             hydrated.neighbours.before = []
-            if config.neighbours.before > 0:
-                for previous_id in index.n_previous(paragraph_id, config.neighbours.before):
+            if neighbours_before > 0:
+                for previous_id in index.n_previous(paragraph_id, neighbours_before):
                     hydrated.neighbours.before.insert(0, previous_id)
                     related.add(ParagraphId.from_string(previous_id))
 
-        if config.neighbours.after is not None:
+        if neighbours_after is not None:
             hydrated.neighbours.after = []
-            if config.neighbours.after > 0:
-                for next_id in index.n_next(paragraph_id, config.neighbours.after):
+            if neighbours_after > 0:
+                for next_id in index.n_next(paragraph_id, neighbours_after):
                     hydrated.neighbours.after.append(next_id)
                     related.add(ParagraphId.from_string(next_id))
 
-    if config.parents:
+    if parents:
         hydrated.parents = []
         for parent_id in index.parents(paragraph_id):
             hydrated.parents.append(parent_id)
             related.add(ParagraphId.from_string(parent_id))
 
-    if config.siblings:
+    if siblings:
         hydrated.siblings = []
         for sibling_id in index.siblings(paragraph_id):
            hydrated.siblings.append(sibling_id)
            related.add(ParagraphId.from_string(sibling_id))
 
-    if config.replacements:
+    if replacements:
        hydrated.replacements = []
        for replacement_id in index.replacements(paragraph_id):
            hydrated.replacements.append(replacement_id)
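Going by the new keyword-only signature and the call site in the hunks above, a caller that wants one neighbouring paragraph on each side plus parent paragraphs would now invoke the helper roughly like this (a sketch of the call shape only, to be run inside an async function; paragraph_id and field_paragraphs_index stand for values built earlier in hydrate_paragraph):

    refs, related_ids = await related_paragraphs_refs(
        paragraph_id,
        field_paragraphs_index,
        neighbours_before=1,
        neighbours_after=1,
        parents=True,
        # siblings and replacements keep their False defaults
    )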