nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,24 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
+ from nucliadb_telemetry.metrics import Observer
23
+
24
+ # Module-level Observer named "augmentor" with a single "type" label that
+ # defaults to the empty string here. Call sites supply the label value, e.g.
+ # `augmentor_observer.wrap({"type": "paragraph_text"})`.
+ # NOTE(review): exact metrics recorded depend on nucliadb_telemetry's
+ # Observer implementation, which is not visible from this module.
+ augmentor_observer = Observer("augmentor", labels={"type": ""})
@@ -0,0 +1,334 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ from collections.abc import Sequence
22
+ from typing import cast
23
+
24
+ from typing_extensions import assert_never
25
+
26
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
27
+ from nucliadb.ingest.fields.base import Field
28
+ from nucliadb.ingest.orm.resource import Resource
29
+ from nucliadb.models.internal.augment import (
30
+ AugmentedParagraph,
31
+ AugmentedRelatedParagraphs,
32
+ Metadata,
33
+ Paragraph,
34
+ ParagraphImage,
35
+ ParagraphPage,
36
+ ParagraphPosition,
37
+ ParagraphProp,
38
+ ParagraphTable,
39
+ ParagraphText,
40
+ RelatedParagraphs,
41
+ )
42
+ from nucliadb.search.augmentor.metrics import augmentor_observer
43
+ from nucliadb.search.augmentor.utils import limited_concurrency
44
+ from nucliadb.search.search import cache
45
+ from nucliadb.search.search.paragraphs import get_paragraph_from_full_text
46
+ from nucliadb_models.search import TextPosition
47
+ from nucliadb_protos import resources_pb2
48
+
49
+
50
async def augment_paragraphs(
    kbid: str,
    given: list[Paragraph],
    select: list[ParagraphProp],
    *,
    concurrency_control: asyncio.Semaphore | None = None,
) -> dict[ParagraphId, AugmentedParagraph | None]:
    """Augment every paragraph in `given` with the properties in `select`.

    One asyncio task is created per paragraph. Each task runs
    `augment_paragraph` through `limited_concurrency`, which receives
    `concurrency_control` as its `max_ops` argument.

    Returns a mapping from each paragraph id to the value produced by
    `augment_paragraph` for it (which may be None).
    """
    tasks = [
        asyncio.create_task(
            limited_concurrency(
                augment_paragraph(kbid, item.id, select, item.metadata),
                max_ops=concurrency_control,
            )
        )
        for item in given
    ]
    outcomes: list[AugmentedParagraph | None] = await asyncio.gather(*tasks)

    # gather returns results in the order the tasks were passed, so they line
    # up one to one with `given`
    return {item.id: outcome for item, outcome in zip(given, outcomes)}
75
+
76
+
77
async def augment_paragraph(
    kbid: str,
    paragraph_id: ParagraphId,
    select: list[ParagraphProp],
    metadata: Metadata | None,
) -> AugmentedParagraph | None:
    """Augment a single paragraph, resolving its resource and field first.

    Returns None when the resource cannot be obtained from the cache or when
    the paragraph's field does not exist in the resource. Otherwise delegates
    to `db_augment_paragraph`.
    """
    resource = await cache.get_resource(kbid, paragraph_id.rid)
    if resource is None:
        # resources that aren't in the DB are skipped
        return None

    field_id = paragraph_id.field_id
    # get_field returns an empty field when it doesn't exist (behaviour
    # designed for ingestion), so existence must be checked explicitly
    exists = await resource.field_exists(FIELD_TYPE_STR_TO_PB[field_id.type], field_id.key)
    if not exists:
        # fields that aren't in the DB are skipped
        return None

    field = await resource.get_field(field_id.key, field_id.pb_type)
    return await db_augment_paragraph(resource, field, paragraph_id, select, metadata)
99
+
100
+
101
async def db_augment_paragraph(
    resource: Resource,
    field: Field,
    paragraph_id: ParagraphId,
    select: list[ParagraphProp],
    metadata: Metadata | None,
) -> AugmentedParagraph:
    """Build the AugmentedParagraph for `paragraph_id` from the selected props.

    `select` is deduplicated first. Paragraph metadata passed by the caller is
    used as is; when it is None it is looked up with `db_paragraph_metadata`,
    lazily and at most once, the first time a prop needs it.
    """
    props = dedup_paragraph_select(select)

    # Lazy accessor: avoids unnecessary DB round trips by fetching metadata
    # only once and only if some prop actually requires it
    known_metadata = metadata
    lookup_pending = metadata is None

    async def access_metadata() -> Metadata | None:
        nonlocal known_metadata, lookup_pending

        if lookup_pending:
            known_metadata = await db_paragraph_metadata(field, paragraph_id)
            # whether found or not, never look it up again
            lookup_pending = False
        return known_metadata

    text = position = related = None
    image_path = table_path = page_preview_path = None

    for prop in props:
        if isinstance(prop, ParagraphText):
            text = await get_paragraph_text(field, paragraph_id)

        elif isinstance(prop, ParagraphPosition):
            position = await get_paragraph_position(field, paragraph_id)

        elif isinstance(prop, ParagraphImage):
            metadata = await access_metadata()
            if metadata is not None and metadata.is_an_image and metadata.source_file:
                image_path = f"generated/{metadata.source_file}"

        elif isinstance(prop, ParagraphTable):
            metadata = await access_metadata()
            if metadata is not None and metadata.is_a_table:
                if prop.prefer_page_preview and metadata.page and metadata.in_page_with_visual:
                    page_preview_path = f"generated/extracted_images_{metadata.page}.png"
                    table_path = page_preview_path
                elif metadata.source_file:
                    image_path = f"generated/{metadata.source_file}"
                    table_path = image_path

        elif isinstance(prop, ParagraphPage):
            if prop.preview:
                metadata = await access_metadata()
                if metadata is not None and metadata.page and metadata.in_page_with_visual:
                    page_preview_path = f"generated/extracted_images_{metadata.page}.png"

        elif isinstance(prop, RelatedParagraphs):
            related = await related_paragraphs(
                field,
                paragraph_id,
                neighbours_before=prop.neighbours_before,
                neighbours_after=prop.neighbours_after,
            )

        else:  # pragma: no cover
            assert_never(prop)

    return AugmentedParagraph(
        id=paragraph_id,
        text=text,
        position=position,
        source_image_path=image_path,
        table_image_path=table_path,
        page_preview_path=page_preview_path,
        related=related,
    )
186
+
187
+
188
def dedup_paragraph_select(select: list[ParagraphProp]) -> list[ParagraphProp]:
    """Merge duplicated properties, keeping the broadest augmentation requested.

    Props are grouped by their `prop` attribute, preserving first-seen order.
    Duplicates are merged as follows:

    - ParagraphText / ParagraphPosition / ParagraphImage: no parameters,
      the first occurrence is kept.
    - ParagraphTable: `prefer_page_preview` is OR-ed.
    - ParagraphPage: `preview` is OR-ed.
    - RelatedParagraphs: `neighbours_before` / `neighbours_after` take the
      maximum requested.

    The objects in `select` are never modified: the first occurrence of each
    prop is shallow-copied and later duplicates are merged into that copy.
    This matters because the same `select` list is shared by every paragraph
    being augmented.
    """
    import copy  # function-scope import: only needed by this helper

    merged = {}
    for prop in select:
        if prop.prop not in merged:
            # copy so merging below never writes into the caller's objects
            merged[prop.prop] = copy.copy(prop)
            continue

        m = merged[prop.prop]

        if isinstance(prop, (ParagraphText, ParagraphPosition, ParagraphImage)):
            # properties without parameters
            pass

        elif isinstance(prop, ParagraphTable):
            m = cast(ParagraphTable, m)
            m.prefer_page_preview = m.prefer_page_preview or prop.prefer_page_preview

        elif isinstance(prop, ParagraphPage):
            m = cast(ParagraphPage, m)
            m.preview = m.preview or prop.preview

        elif isinstance(prop, RelatedParagraphs):
            m = cast(RelatedParagraphs, m)
            m.neighbours_before = max(m.neighbours_before, prop.neighbours_before)
            m.neighbours_after = max(m.neighbours_after, prop.neighbours_after)

        else:  # pragma: no cover
            assert_never(prop)

    return list(merged.values())
226
+
227
+
228
async def db_paragraph_metadata(field: Field, paragraph_id: ParagraphId) -> Metadata | None:
    """Obtain paragraph metadata from the source of truth (maindb/blob).

    This operation may require data from blob storage, which makes it costly.

    Returns None when the field has no paragraph metadata or when no paragraph
    of the field matches `paragraph_id`.
    """
    candidates = await get_field_paragraphs(field)
    if candidates is None:
        # no paragraph metadata for this field, nothing we can do
        return None

    for candidate in candidates:
        candidate_id = field.field_id.paragraph_id(candidate.start, candidate.end)
        if candidate_id == paragraph_id:
            return Metadata.from_db_paragraph(candidate)

    return None
246
+
247
+
248
async def get_field_paragraphs(field: Field) -> Sequence[resources_pb2.Paragraph] | None:
    """Return the paragraphs stored in the field's metadata.

    Returns None when the field has no field metadata. When the field id has
    a subfield id, the paragraphs come from that entry of `split_metadata`;
    otherwise they come from the main `metadata`.
    """
    stored = await field.get_field_metadata()
    if stored is None:
        return None

    split = field.field_id.subfield_id
    if split is not None:
        return stored.split_metadata[split].paragraphs
    return stored.metadata.paragraphs
260
+
261
+
262
@augmentor_observer.wrap({"type": "paragraph_text"})
async def get_paragraph_text(field: Field, paragraph_id: ParagraphId) -> str | None:
    """Return the paragraph text, or None when it can't be obtained.

    The text is read with `get_paragraph_from_full_text` using the start/end
    offsets and subfield id carried by `paragraph_id`.
    """
    extracted = await get_paragraph_from_full_text(
        field=field,
        start=paragraph_id.paragraph_start,
        end=paragraph_id.paragraph_end,
        split=paragraph_id.field_id.subfield_id,
        log_on_missing_field=True,
    )
    # the helper above returns an empty string when the text is not found; be
    # explicit about the absence by returning None instead
    if not extracted:
        return None
    return extracted
274
+
275
+
276
async def get_paragraph_position(field: Field, paragraph_id: ParagraphId) -> TextPosition | None:
    """Return the position of the paragraph within its field.

    The paragraph is located by comparing `paragraph_id` with the id built
    from each stored paragraph's start/end. Returns None when the field has
    no paragraph metadata or when the paragraph is not found.
    """
    candidates = await get_field_paragraphs(field)
    if candidates is None:
        return None

    for position, candidate in enumerate(candidates):
        candidate_id = field.field_id.paragraph_id(candidate.start, candidate.end)
        if candidate_id != paragraph_id:
            continue
        return TextPosition(
            index=position,
            start=candidate.start,
            end=candidate.end,
            start_seconds=list(candidate.start_seconds),
            end_seconds=list(candidate.end_seconds),
        )

    # the paragraph wasn't found, so no position can be provided
    return None
297
+
298
+
299
async def related_paragraphs(
    field: Field,
    paragraph_id: ParagraphId,
    *,
    neighbours_before: int = 0,
    neighbours_after: int = 0,
) -> AugmentedRelatedParagraphs | None:
    """Return the ids of the paragraphs surrounding `paragraph_id` in its field.

    Up to `neighbours_before` paragraphs preceding it and up to
    `neighbours_after` paragraphs following it are returned, in field order
    and clamped to the bounds of the field's paragraph list.

    Returns None when the field has no paragraph metadata or when the
    paragraph is not found among the field's paragraphs.
    """
    paragraphs = await get_field_paragraphs(field)
    if paragraphs is None:
        return None

    def id_at(position: int) -> ParagraphId:
        item = paragraphs[position]
        return field.field_id.paragraph_id(item.start, item.end)

    target: int | None = None
    for position, candidate in enumerate(paragraphs):
        if field.field_id.paragraph_id(candidate.start, candidate.end) == paragraph_id:
            target = position
            break
    if target is None:
        # the paragraph wasn't found, so there are no related ones either
        return None

    first_before = max(target - neighbours_before, 0)
    end_after = min(target + 1 + neighbours_after, len(paragraphs))

    return AugmentedRelatedParagraphs(
        neighbours_before=[id_at(i) for i in range(first_before, target)],
        neighbours_after=[id_at(i) for i in range(target + 1, end_after)],
    )
@@ -0,0 +1,238 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+
22
+ from typing_extensions import assert_never
23
+
24
+ import nucliadb_models.resource
25
+ from nucliadb.common import datamanagers
26
+ from nucliadb.ingest.orm.resource import Resource
27
+ from nucliadb.ingest.serialize import (
28
+ serialize_extra,
29
+ serialize_origin,
30
+ serialize_resource,
31
+ serialize_security,
32
+ )
33
+ from nucliadb.models.internal.augment import (
34
+ AugmentedResource,
35
+ ResourceClassificationLabels,
36
+ ResourceExtra,
37
+ ResourceOrigin,
38
+ ResourceProp,
39
+ ResourceSecurity,
40
+ ResourceSummary,
41
+ ResourceTitle,
42
+ )
43
+ from nucliadb.search.augmentor.metrics import augmentor_observer
44
+ from nucliadb.search.augmentor.utils import limited_concurrency
45
+ from nucliadb.search.search import cache
46
+ from nucliadb.search.search.hydrator import ResourceHydrationOptions
47
+ from nucliadb_models.search import ResourceProperties
48
+ from nucliadb_protos import resources_pb2
49
+ from nucliadb_utils import const
50
+ from nucliadb_utils.utilities import has_feature
51
+
52
+
53
async def augment_resources(
    kbid: str,
    given: list[str],
    select: list[ResourceProp],
    *,
    concurrency_control: asyncio.Semaphore | None = None,
) -> dict[str, AugmentedResource | None]:
    """Augment every resource id in `given` with the properties in `select`.

    One asyncio task is created per resource id. Each task runs
    `augment_resource` through `limited_concurrency`, which receives
    `concurrency_control` as its `max_ops` argument.

    Returns a mapping from each resource id to the value produced by
    `augment_resource` for it (which may be None).
    """
    tasks = [
        asyncio.create_task(
            limited_concurrency(
                augment_resource(kbid, resource_id, select),
                max_ops=concurrency_control,
            )
        )
        for resource_id in given
    ]
    outcomes: list[AugmentedResource | None] = await asyncio.gather(*tasks)

    # gather returns results in task order, matching the order of `given`
    return {resource_id: outcome for resource_id, outcome in zip(given, outcomes)}
78
+
79
+
80
async def augment_resource(
    kbid: str,
    rid: str,
    select: list[ResourceProp],
) -> AugmentedResource | None:
    """Augment a single resource, or return None if it can't be obtained.

    The resource is looked up through the search cache and, when present,
    handed to `db_augment_resource` together with `select`.
    """
    found = await cache.get_resource(kbid, rid)
    if found is not None:
        return await db_augment_resource(found, select)

    # resources that aren't in the DB are skipped
    return None
91
+
92
+
93
@augmentor_observer.wrap({"type": "db_resource"})
async def db_augment_resource(
    resource: Resource,
    select: list[ResourceProp],
) -> AugmentedResource:
    """Build the AugmentedResource for `resource` from the selected props.

    `select` is deduplicated first. Title and summary are both read from
    `resource.get_basic()`, which is awaited lazily and reused between them.
    Origin, extra and security come from the serialize helpers, and
    classification labels from `classification_labels`.
    """
    wanted = dedup_resource_select(select)

    loaded_basic = None

    async def load_basic():
        # fetch basic the first time it's needed and reuse it afterwards
        nonlocal loaded_basic
        if loaded_basic is None:
            loaded_basic = await resource.get_basic()
        return loaded_basic

    title = summary = None
    origin = extra = security = None
    labels = None

    for prop in wanted:
        if isinstance(prop, ResourceTitle):
            basic = await load_basic()
            if basic is not None:
                title = basic.title

        elif isinstance(prop, ResourceSummary):
            basic = await load_basic()
            if basic is not None:
                summary = basic.summary

        elif isinstance(prop, ResourceOrigin):
            origin = await serialize_origin(resource)

        elif isinstance(prop, ResourceExtra):
            extra = await serialize_extra(resource)

        elif isinstance(prop, ResourceSecurity):
            security = await serialize_security(resource)

        elif isinstance(prop, ResourceClassificationLabels):
            labels = await classification_labels(resource)

        else:
            assert_never(prop)

    return AugmentedResource(
        id=resource.uuid,
        title=title,
        summary=summary,
        origin=origin,
        extra=extra,
        security=security,
        classification_labels=labels,
    )
146
+
147
+
148
def dedup_resource_select(select: list[ResourceProp]) -> list[ResourceProp]:
    """Drop duplicated props, keeping the first occurrence of each `prop` id.

    No resource prop carries parameters that need merging, so uniqueness by
    prop id is enough. First-seen order is preserved.
    """
    first_seen: dict[str, ResourceProp] = {}
    for candidate in select:
        if candidate.prop not in first_seen:
            first_seen[candidate.prop] = candidate
    return [*first_seen.values()]
155
+
156
+
157
async def get_basic(resource: Resource) -> resources_pb2.Basic | None:
    """Return the resource's basic as returned by the datamanager.

    HACK: resource.get_basic() always returns a pb, even if it's not in the
    DB. Going through the datamanager with the resource's own transaction
    tells us whether basic is really there.
    """
    return await datamanagers.resources.get_basic(resource.txn, kbid=resource.kbid, rid=resource.uuid)
162
+
163
+
164
async def classification_labels(resource: Resource) -> dict[str, set[str]] | None:
    """Group the resource's user classifications by labelset.

    Returns None when `get_basic` returns None for the resource. Otherwise
    returns a dict mapping each labelset to the set of labels found in
    `basic.usermetadata.classifications`.
    """
    basic = await get_basic(resource)
    if basic is None:
        return None

    by_labelset: dict[str, set[str]] = {}
    for item in basic.usermetadata.classifications:
        if item.labelset not in by_labelset:
            by_labelset[item.labelset] = set()
        by_labelset[item.labelset].add(item.label)
    return by_labelset
173
+
174
+
175
async def augment_resources_deep(
    kbid: str,
    given: list[str],
    opts: ResourceHydrationOptions,
    *,
    concurrency_control: asyncio.Semaphore | None = None,
) -> dict[str, nucliadb_models.resource.Resource | None]:
    """Augment resources using the Resource model.

    Depending on the options, this can serialize resource fields, extracted
    data like text, vectors... Thus, this operation can be quite expensive.

    NOTE: when extracted data is requested and the
    IGNORE_EXTRACTED_IN_SEARCH feature is enabled for the KB, `opts` is
    modified in place (EXTRACTED is removed from `opts.show` and
    `opts.extracted` is cleared).

    Returns a mapping from each resource id to the value produced by
    `augment_resource_deep` for it (which may be None).
    """
    extracted_requested = ResourceProperties.EXTRACTED in opts.show
    if extracted_requested and has_feature(
        const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
    ):
        # Returning extracted metadata in search results is deprecated and this
        # flag will be set to True for all KBs in the future.
        opts.show.remove(ResourceProperties.EXTRACTED)
        opts.extracted.clear()

    tasks = [
        asyncio.create_task(
            limited_concurrency(
                augment_resource_deep(kbid, resource_id, opts),
                max_ops=concurrency_control,
            )
        )
        for resource_id in given
    ]
    outcomes: list[nucliadb_models.resource.Resource | None] = await asyncio.gather(*tasks)

    # gather returns results in task order, matching the order of `given`
    augmented: dict[str, nucliadb_models.resource.Resource | None] = dict(zip(given, outcomes))
    return augmented
213
+
214
+
215
# NOTE: the observer label was previously misspelled as "seialize_resource";
# any dashboard or alert keyed on the old value needs updating.
@augmentor_observer.wrap({"type": "serialize_resource"})
async def augment_resource_deep(
    kbid: str,
    rid: str,
    opts: ResourceHydrationOptions,
) -> nucliadb_models.resource.Resource | None:
    """Augment a resource using the Resource model.

    Depending on the options, this can serialize resource fields, extracted
    data like text, vectors... Thus, this operation can be quite expensive.

    Args:
        kbid: knowledge box id the resource belongs to.
        rid: id of the resource to augment.
        opts: hydration options; `show`, `field_type_filter` and `extracted`
            are forwarded to `serialize_resource`.

    Returns:
        The value returned by `serialize_resource`, or None when the resource
        can't be obtained from the cache.
    """
    resource = await cache.get_resource(kbid, rid)
    if resource is None:
        # skip resources that aren't in the DB
        return None

    return await serialize_resource(
        resource,
        show=opts.show,
        field_type_filter=opts.field_type_filter,
        extracted=opts.extracted,
    )
@@ -0,0 +1,33 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ from collections.abc import Awaitable
22
+ from contextlib import AsyncExitStack
23
+ from typing import TypeVar
24
+
25
+ T = TypeVar("T")
26
+
27
+
28
+ async def limited_concurrency(aw: Awaitable[T], *, max_ops: asyncio.Semaphore | None) -> T:
29
+ async with AsyncExitStack() as stack:
30
+ if max_ops is not None:
31
+ await stack.enter_async_context(max_ops)
32
+ r = await aw
33
+ return r
@@ -24,7 +24,7 @@ from fastapi import FastAPI
24
24
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
25
25
  from nucliadb.common.context.fastapi import inject_app_context
26
26
  from nucliadb.common.maindb.utils import setup_driver
27
- from nucliadb.common.nidx import start_nidx_utility
27
+ from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
28
28
  from nucliadb.ingest.utils import start_ingest, stop_ingest
29
29
  from nucliadb.search import SERVICE_NAME
30
30
  from nucliadb.search.predict import start_predict_engine
@@ -61,6 +61,8 @@ async def lifespan(app: FastAPI):
61
61
  if get_utility(Utility.PREDICT):
62
62
  clean_utility(Utility.PREDICT)
63
63
 
64
+ await stop_nidx_utility()
65
+
64
66
  await finalize_utilities()
65
67
  await stop_audit_utility()
66
68
  await teardown_cluster()