nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,323 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ from dataclasses import dataclass
22
+
23
+ from nucliadb.common.ids import FieldId, ParagraphId
24
+ from nucliadb.ingest.fields.base import Field
25
+ from nucliadb.ingest.orm.resource import Resource
26
+ from nucliadb.search.augmentor.paragraphs import get_paragraph_text
27
+ from nucliadb.search.search.hydrator.fields import page_preview_id
28
+ from nucliadb.search.search.hydrator.images import paragraph_source_image
29
+ from nucliadb_models import hydration as hydration_models
30
+ from nucliadb_protos import resources_pb2
31
+ from nucliadb_protos.resources_pb2 import FieldComputedMetadata
32
+
33
+
34
class ParagraphIndex:
    """In-memory index over the paragraphs of a single field.

    Once built from the field computed metadata, it resolves by full
    paragraph id: the paragraph message itself, its previous/next neighbours
    (following the order in which paragraphs are listed in the metadata) and
    its parent, sibling and replacement relations.

    """

    NEXT = "next"
    PREVIOUS = "previous"
    PARENTS = "parents"
    SIBLINGS = "siblings"
    REPLACEMENTS = "replacements"

    def __init__(self, field_id: FieldId) -> None:
        self.field_id = field_id
        # full paragraph id -> paragraph message
        self.paragraphs: dict[str, resources_pb2.Paragraph] = {}
        # (full paragraph id, NEXT | PREVIOUS) -> adjacent paragraph id
        self.neighbours: dict[tuple[str, str], str] = {}
        # (full paragraph id, PARENTS | SIBLINGS | REPLACEMENTS) -> related ids
        self.related: dict[tuple[str, str], list[str]] = {}
        self._lock = asyncio.Lock()
        self._built = False

    async def build(self, field: Field):
        """Populate the index from `field` metadata unless already done.

        Concurrent callers serialize on an asyncio lock and the built flag is
        checked again once the lock is held, so the metadata is fetched and
        indexed a single time.
        """
        if self._built:
            return

        async with self._lock:
            # somebody else may have finished the build while we were waiting
            if self._built:
                return

            metadata = await field.get_field_metadata()

            # Missing metadata (e.g. the field is still being processed) is
            # treated as "no paragraphs": the index is flagged as built
            # anyway so the view stays consistent during the hydration, and
            # no paragraph will be found for this field.
            if metadata is not None:
                # REVIEW: CPU-bound work, running it in an executor could
                # avoid blocking the event loop
                self._build(metadata)

            self._built = True

    def _build(self, field_metadata: FieldComputedMetadata):
        """Reset the index and fill it from `field_metadata`."""
        for mapping in (self.paragraphs, self.neighbours, self.related):
            mapping.clear()

        # fields with a subfield (split) keep their paragraphs per split
        split = self.field_id.subfield_id
        if split is not None:
            extracted = field_metadata.split_metadata[split]
        else:
            extracted = field_metadata.metadata

        last_seen = None
        for paragraph in extracted.paragraphs:
            current = self.field_id.paragraph_id(paragraph.start, paragraph.end).full()
            self.paragraphs[current] = paragraph

            # link this paragraph with the one listed right before it
            if last_seen is not None:
                self.neighbours[(last_seen, ParagraphIndex.NEXT)] = current
                self.neighbours[(current, ParagraphIndex.PREVIOUS)] = last_seen
            last_seen = current

            relations = paragraph.relations
            self.related[(current, ParagraphIndex.PARENTS)] = list(relations.parents)
            self.related[(current, ParagraphIndex.SIBLINGS)] = list(relations.siblings)
            self.related[(current, ParagraphIndex.REPLACEMENTS)] = list(relations.replacements)

    def get(self, paragraph_id: str | ParagraphId) -> resources_pb2.Paragraph | None:
        """Return the indexed paragraph for `paragraph_id`, if any."""
        return self.paragraphs.get(str(paragraph_id))

    def previous(self, paragraph_id: str | ParagraphId) -> str | None:
        """Return the id of the paragraph right before `paragraph_id`, if any."""
        return self.neighbours.get((str(paragraph_id), ParagraphIndex.PREVIOUS))

    def next(self, paragraph_id: str | ParagraphId) -> str | None:
        """Return the id of the paragraph right after `paragraph_id`, if any."""
        return self.neighbours.get((str(paragraph_id), ParagraphIndex.NEXT))

    def n_previous(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
        """Return up to `count` ids preceding `paragraph_id`, in index order
        (the closest paragraph comes last).
        """
        assert count >= 1, f"can't find negative previous {count}"
        found: list[str] = []
        cursor = str(paragraph_id)
        while len(found) < count:
            candidate = self.previous(cursor)
            if candidate is None:
                # nothing before the first paragraph
                break
            found.append(candidate)
            cursor = candidate
        # we walked backwards, flip to get index order
        found.reverse()
        return found

    def n_next(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
        """Return up to `count` ids following `paragraph_id`, in index order
        (the closest paragraph comes first).
        """
        assert count >= 1, f"can't find negative nexts {count}"
        found = []
        cursor = str(paragraph_id)
        while len(found) < count:
            candidate = self.next(cursor)
            if candidate is None:
                # nothing after the last paragraph
                break
            cursor = candidate
            found.append(candidate)
        return found

    def parents(self, paragraph_id: str | ParagraphId) -> list[str]:
        """Return the parent ids of `paragraph_id` (empty if unknown)."""
        return self.related.get((str(paragraph_id), ParagraphIndex.PARENTS), [])

    def siblings(self, paragraph_id: str | ParagraphId) -> list[str]:
        """Return the sibling ids of `paragraph_id` (empty if unknown)."""
        return self.related.get((str(paragraph_id), ParagraphIndex.SIBLINGS), [])

    def replacements(self, paragraph_id: str | ParagraphId) -> list[str]:
        """Return the replacement ids of `paragraph_id` (empty if unknown)."""
        return self.related.get((str(paragraph_id), ParagraphIndex.REPLACEMENTS), [])
165
+
166
+
167
@dataclass
class ExtraParagraphHydration:
    """Side information produced while hydrating a paragraph, returned next
    to the hydrated paragraph so the caller can complete the hydration.

    """

    # page number whose preview is referenced by the paragraph page hydration
    # (None when no page preview has been referenced)
    field_page: int | None
    # page number whose preview is referenced by the paragraph table
    # hydration (None when no table preview has been referenced)
    field_table_page: int | None
    # ids of the paragraphs related to the hydrated one
    related_paragraph_ids: list[ParagraphId]
172
+
173
+
174
async def hydrate_paragraph(
    resource: Resource,
    field: Field,
    paragraph_id: ParagraphId,
    config: hydration_models.ParagraphHydration,
    field_paragraphs_index: ParagraphIndex,
) -> tuple[hydration_models.HydratedParagraph, ExtraParagraphHydration]:
    """Hydrate `paragraph_id` following `config` and return it together with
    the extra information the caller needs to build a coherent hydration
    around it (pages to preview and related paragraphs).

    The resource and field exist, but the paragraph id doesn't have to match
    a paragraph from the field metadata: it can be a made-up range covering
    more or less text than the extracted one. For those, only text can be
    hydrated.

    """
    kbid = resource.kbid

    hydrated = hydration_models.HydratedParagraph(
        id=paragraph_id.full(),
        field=paragraph_id.field_id.full(),
        resource=paragraph_id.rid,
    )
    extra_hydration = ExtraParagraphHydration(
        field_page=None, field_table_page=None, related_paragraph_ids=[]
    )

    if config.text:
        hydrated.text = await get_paragraph_text(field, paragraph_id)

    # everything else comes from the paragraph metadata
    needs_paragraph_metadata = config.image or config.table or config.page or config.related
    if not needs_paragraph_metadata:
        return hydrated, extra_hydration

    await field_paragraphs_index.build(field)
    paragraph = field_paragraphs_index.get(paragraph_id)
    if paragraph is None:
        # made-up paragraph, there's nothing else we can hydrate
        return hydrated, extra_hydration

    if config.related:
        neighbours_config = config.related.neighbours
        if neighbours_config is None:
            before, after = None, None
        else:
            before, after = neighbours_config.before, neighbours_config.after

        related_refs, related_ids = await related_paragraphs_refs(
            paragraph_id,
            field_paragraphs_index,
            neighbours_before=before,
            neighbours_after=after,
            parents=config.related.parents or False,
            siblings=config.related.siblings or False,
            replacements=config.related.replacements or False,
        )
        hydrated.related = related_refs
        extra_hydration.related_paragraph_ids = related_ids

    if config.image:
        hydrated.image = hydration_models.HydratedParagraphImage()

        if config.image.source_image:
            hydrated.image.source_image = await paragraph_source_image(kbid, paragraph_id, paragraph)

    if config.page:
        if hydrated.page is None:
            hydrated.page = hydration_models.HydratedParagraphPage()

        if config.page.page_with_visual and paragraph.page.page_with_visual:
            # The paragraph is on a page with visual content: reference the
            # preview of that page so it can be returned as an image.
            # TODO: decide what to do if the page turns out to be missing in
            # the DB later on
            visual_page = paragraph.page.page
            hydrated.page.page_preview_ref = page_preview_id(visual_page)
            extra_hydration.field_page = visual_page

    if config.table:
        if hydrated.table is None:
            hydrated.table = hydration_models.HydratedParagraphTable()

        if config.table.table_page_preview and paragraph.representation.is_a_table:
            # For table paragraphs we reference the preview of the page
            # containing the table. Hydrating the paragraph reference_file
            # would be the ideal, but table screenshots are not always
            # perfect, so the page preview is preferred until they are good
            # enough.
            table_page = paragraph.page.page
            hydrated.table.page_preview_ref = page_preview_id(table_page)
            extra_hydration.field_table_page = table_page

    return hydrated, extra_hydration
268
+
269
+
270
async def related_paragraphs_refs(
    paragraph_id: ParagraphId,
    index: ParagraphIndex,
    *,
    neighbours_before: int | None = None,
    neighbours_after: int | None = None,
    parents: bool = False,
    siblings: bool = False,
    replacements: bool = False,
) -> tuple[hydration_models.RelatedParagraphRefs, list[ParagraphId]]:
    """Compute the related paragraph references for a specific `paragraph_id`
    and return them with the plain list of unique related paragraphs (to
    facilitate work to the caller).

    `neighbours_before` / `neighbours_after` are the maximum number of
    neighbours to reference on each side: `None` leaves that side unset, 0
    (or less) sets it to an empty list. `parents`, `siblings` and
    `replacements` enable the corresponding relation lists. Ids are looked up
    in `index`, which is expected to be already built. The order of the
    returned list of unique ids is unspecified.

    """
    hydrated = hydration_models.RelatedParagraphRefs()
    related = set()

    # The neighbours container must exist as soon as any side is requested,
    # including a request for 0 neighbours. Checking truthiness here (as
    # opposed to `is not None`) would skip its creation for
    # `neighbours_before=0` without a positive `neighbours_after` and the
    # assignments below would then be done on a missing container
    # (presumably None by default -- the model isn't visible from here).
    if neighbours_before is not None or neighbours_after is not None:
        hydrated.neighbours = hydration_models.RelatedNeighbourParagraphRefs()

        if neighbours_before is not None:
            hydrated.neighbours.before = []
            if neighbours_before > 0:
                # `n_previous` returns ids in index order (closest last) and
                # each one is inserted at the front, so `before` ends up with
                # the closest paragraph first.
                # NOTE(review): confirm this is the intended order
                for previous_id in index.n_previous(paragraph_id, neighbours_before):
                    hydrated.neighbours.before.insert(0, previous_id)
                    related.add(ParagraphId.from_string(previous_id))

        if neighbours_after is not None:
            hydrated.neighbours.after = []
            if neighbours_after > 0:
                for next_id in index.n_next(paragraph_id, neighbours_after):
                    hydrated.neighbours.after.append(next_id)
                    related.add(ParagraphId.from_string(next_id))

    if parents:
        hydrated.parents = []
        for parent_id in index.parents(paragraph_id):
            hydrated.parents.append(parent_id)
            related.add(ParagraphId.from_string(parent_id))

    if siblings:
        hydrated.siblings = []
        for sibling_id in index.siblings(paragraph_id):
            hydrated.siblings.append(sibling_id)
            related.add(ParagraphId.from_string(sibling_id))

    if replacements:
        hydrated.replacements = []
        for replacement_id in index.replacements(paragraph_id):
            hydrated.replacements.append(replacement_id)
            related.add(ParagraphId.from_string(replacement_id))

    return hydrated, list(related)
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (C) 2021 Bosutech XXI S.L.
4
+ #
5
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
6
+ # For commercial licensing, contact us at info@nuclia.com.
7
+ #
8
+ # AGPL:
9
+ # This program is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Affero General Public License as
11
+ # published by the Free Software Foundation, either version 3 of the
12
+ # License, or (at your option) any later version.
13
+ #
14
+ # This program is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Affero General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Affero General Public License
20
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
21
+ #
22
+
23
+ from nucliadb.ingest.orm.resource import Resource
24
+ from nucliadb.models.internal.augment import (
25
+ ResourceOrigin,
26
+ ResourceProp,
27
+ ResourceSecurity,
28
+ ResourceSummary,
29
+ ResourceTitle,
30
+ )
31
+ from nucliadb.search.augmentor.resources import db_augment_resource
32
+ from nucliadb_models import hydration as hydration_models
33
+
34
+
35
async def hydrate_resource(
    resource: Resource, rid: str, config: hydration_models.ResourceHydration
) -> hydration_models.HydratedResource:
    """Hydrate a resource: its id and slug are always included, while title,
    summary, origin and security are requested to the augmentor only when
    enabled in `config`.

    """
    basic = await resource.get_basic()
    hydrated = hydration_models.HydratedResource(id=rid, slug=basic.slug)

    # config flag -> property to request, in the order they are selected
    selectable = (
        (config.title, ResourceTitle),
        (config.summary, ResourceSummary),
        (config.origin, ResourceOrigin),
        (config.security, ResourceSecurity),
    )
    select: list[ResourceProp] = [prop() for enabled, prop in selectable if enabled]

    augmented = await db_augment_resource(resource, select)

    # copy over whatever the augmentor returned for each property
    hydrated.title = augmented.title
    hydrated.summary = augmented.summary
    hydrated.origin = augmented.origin
    hydrated.security = augmented.security

    return hydrated
@@ -19,10 +19,10 @@
19
19
  #
20
20
  import asyncio
21
21
  from base64 import b64encode
22
- from typing import Optional
23
22
 
24
23
  from nucliadb.common import datamanagers
25
24
  from nucliadb.ingest.fields.base import Field
25
+ from nucliadb.ingest.orm.resource import Resource
26
26
  from nucliadb.search.predict_models import (
27
27
  FieldInfo,
28
28
  NameOperationFilter,
@@ -40,8 +40,8 @@ async def run_agents(
40
40
  kbid: str,
41
41
  rid: str,
42
42
  user_id: str,
43
- filters: Optional[list[AgentsFilter]] = None,
44
- agent_ids: Optional[list[str]] = None,
43
+ filters: list[AgentsFilter] | None = None,
44
+ agent_ids: list[str] | None = None,
45
45
  ) -> RunAgentsResponse:
46
46
  fields = await fetch_resource_fields(kbid, rid)
47
47
 
@@ -56,7 +56,7 @@ async def run_agents(
56
56
  return await predict.run_agents(kbid, item)
57
57
 
58
58
 
59
- def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameOperationFilter]]:
59
+ def _parse_filters(filters: list[AgentsFilter] | None) -> list[NameOperationFilter] | None:
60
60
  if filters is None:
61
61
  return None
62
62
  return [
@@ -69,7 +69,7 @@ def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameO
69
69
 
70
70
  async def fetch_resource_fields(kbid: str, rid: str) -> list[FieldInfo]:
71
71
  async with datamanagers.with_ro_transaction() as txn:
72
- resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
72
+ resource = await Resource.get(txn, kbid=kbid, rid=rid)
73
73
  if resource is None:
74
74
  raise ResourceNotFoundError()
75
75
  fields = await resource.get_fields(force=True)