nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,49 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from pydantic import BaseModel
21
+
22
+ from nucliadb_models.common import FieldTypeName
23
+ from nucliadb_models.resource import ExtractedDataTypeName
24
+ from nucliadb_models.search import ResourceProperties
25
+
26
+
27
class ResourceHydrationOptions(BaseModel):
    """Settings that control how a resource is hydrated.

    Every option is a list that defaults to empty.
    """

    # resource properties to include
    show: list[ResourceProperties] = []
    # kinds of extracted data to include
    extracted: list[ExtractedDataTypeName] = []
    # field types to restrict hydration to
    field_type_filter: list[FieldTypeName] = []
35
+
36
+
37
class TextBlockHydrationOptions(BaseModel):
    """Settings that control how a text block (aka paragraph) is hydrated."""

    # When true, the text block is highlighted using `<mark>...</mark>` tags
    highlight: bool = False

    # Exact matches that should be highlighted (None when there are none)
    ematches: list[str] | None = None

    # When true, hydration is skipped for text blocks whose text is already
    # populated
    only_hydrate_empty: bool = False
@@ -0,0 +1,217 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from typing import cast
21
+
22
+ from typing_extensions import assert_never
23
+
24
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId
25
+ from nucliadb.ingest.fields.base import Field
26
+ from nucliadb.ingest.fields.conversation import Conversation
27
+ from nucliadb.ingest.fields.file import File
28
+ from nucliadb.ingest.fields.generic import Generic
29
+ from nucliadb.ingest.fields.link import Link
30
+ from nucliadb.ingest.fields.text import Text
31
+ from nucliadb.models.internal.augment import ConversationProp, FieldProp, FieldText, FieldValue
32
+ from nucliadb.search.augmentor.fields import (
33
+ db_augment_conversation_field,
34
+ db_augment_file_field,
35
+ db_augment_generic_field,
36
+ db_augment_link_field,
37
+ db_augment_text_field,
38
+ )
39
+ from nucliadb_models import hydration as hydration_models
40
+ from nucliadb_models.common import FieldTypeName
41
+
42
+
43
def page_preview_id(page_number: int) -> str:
    """Return the page number of a specific page rendered as a string."""
    return str(page_number)
46
+
47
+
48
async def hydrate_field(field: Field, field_id: FieldId, config: hydration_models.FieldHydration):
    """Hydrate a field by dispatching to the hydrator for its field type.

    The field type is resolved from `field_id.type` through
    `FIELD_TYPE_STR_TO_NAME`. Each field type has its own section in `config`
    (`text`, `file`, `link`, `conversation`, `generic`); when the section for
    the resolved type is not set, nothing is hydrated.

    Args:
        field: field object to hydrate; it is cast to the concrete class
            matching the resolved field type.
        field_id: id of the field, used to resolve the field type and passed
            on to the type-specific hydrator.
        config: per-field-type hydration configuration.

    Returns:
        The hydrated field produced by the type-specific hydrator, or None
        when `config` has no section for this field type.
    """
    field_type = FIELD_TYPE_STR_TO_NAME[field_id.type]

    if field_type == FieldTypeName.TEXT:
        # Explicit identity check (previously written as the double negation
        # `not config.text is not None`, which means the same thing)
        if config.text is None:
            return
        field = cast(Text, field)
        return await hydrate_text_field(field, field_id, config.text)

    # The branches below used to be chained comparisons of the form
    # `field_type == FieldTypeName.X is not None`. Enum members are never
    # None, so the plain equality is equivalent and says what is meant.
    elif field_type == FieldTypeName.FILE:
        if not config.file:
            return
        field = cast(File, field)
        return await hydrate_file_field(field, field_id, config.file)

    elif field_type == FieldTypeName.LINK:
        if not config.link:
            return
        field = cast(Link, field)
        return await hydrate_link_field(field, field_id, config.link)

    elif field_type == FieldTypeName.CONVERSATION:
        if not config.conversation:
            return
        field = cast(Conversation, field)
        return await hydrate_conversation_field(field, field_id, config.conversation)

    elif field_type == FieldTypeName.GENERIC:
        if not config.generic:
            return
        field = cast(Generic, field)
        return await hydrate_generic_field(field, field_id, config.generic)

    else:  # pragma: no cover
        # Exhaustiveness guard: every FieldTypeName must be handled above
        assert_never(field_type)
83
+
84
+
85
async def hydrate_text_field(
    field: Text,
    field_id: FieldId,
    config: hydration_models.TextFieldHydration,
) -> hydration_models.HydratedTextField:
    """Hydrate a text field with the properties requested in `config`.

    The field value is requested when `config.value` is truthy and the
    extracted text when `config.extracted_text` is truthy. Both are fetched
    through `db_augment_text_field`.

    Returns:
        A `HydratedTextField` with `id`, `resource` and `field_type` always
        set. `value` and `extracted` are only filled in when they were
        requested and the augmented result has a truthy value for them.
    """
    include_value = bool(config.value)
    include_text = bool(config.extracted_text)

    props: list[FieldProp] = []
    if include_value:
        props.append(FieldValue())
    if include_text:
        props.append(FieldText())

    augmented = await db_augment_text_field(field, field_id, props)

    result = hydration_models.HydratedTextField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.TEXT,
    )
    if include_value and augmented.value:
        result.value = augmented.value
    if include_text and augmented.text:
        result.extracted = hydration_models.FieldExtractedData(text=augmented.text)
    return result
111
+
112
+
113
async def hydrate_file_field(
    field: File,
    field_id: FieldId,
    config: hydration_models.FileFieldHydration,
) -> hydration_models.HydratedFileField:
    """Hydrate a file field with the properties requested in `config`.

    The field value is requested when `config.value` is truthy and the
    extracted text when `config.extracted_text` is truthy. Both are fetched
    through `db_augment_file_field`.

    Returns:
        A `HydratedFileField` with `id`, `resource` and `field_type` always
        set. `value` and `extracted` are only filled in when they were
        requested and the augmented result has a truthy value for them.
    """
    include_value = bool(config.value)
    include_text = bool(config.extracted_text)

    props: list[FieldProp] = []
    if include_value:
        props.append(FieldValue())
    if include_text:
        props.append(FieldText())

    augmented = await db_augment_file_field(field, field_id, props)

    result = hydration_models.HydratedFileField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.FILE,
    )
    if include_value and augmented.value:
        result.value = augmented.value
    if include_text and augmented.text:
        result.extracted = hydration_models.FieldExtractedData(text=augmented.text)
    return result
139
+
140
+
141
async def hydrate_link_field(
    field: Link,
    field_id: FieldId,
    config: hydration_models.LinkFieldHydration,
) -> hydration_models.HydratedLinkField:
    """Hydrate a link field with the properties requested in `config`.

    The field value is requested when `config.value` is truthy and the
    extracted text when `config.extracted_text` is truthy. Both are fetched
    through `db_augment_link_field`.

    Returns:
        A `HydratedLinkField` with `id`, `resource` and `field_type` always
        set. `value` and `extracted` are only filled in when they were
        requested and the augmented result has a truthy value for them.
    """
    include_value = bool(config.value)
    include_text = bool(config.extracted_text)

    props: list[FieldProp] = []
    if include_value:
        props.append(FieldValue())
    if include_text:
        props.append(FieldText())

    augmented = await db_augment_link_field(field, field_id, props)

    result = hydration_models.HydratedLinkField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.LINK,
    )
    if include_value and augmented.value:
        result.value = augmented.value
    if include_text and augmented.text:
        result.extracted = hydration_models.FieldExtractedData(text=augmented.text)
    return result
167
+
168
+
169
async def hydrate_conversation_field(
    field: Conversation,
    field_id: FieldId,
    config: hydration_models.ConversationFieldHydration,
) -> hydration_models.HydratedConversationField:
    """Hydrate a conversation field with the properties requested in `config`.

    Only the field value can be requested here (when `config.value` is
    truthy); it is fetched through `db_augment_conversation_field`.

    Returns:
        A `HydratedConversationField` with `id`, `resource` and `field_type`
        always set. `value` is only filled in when it was requested and the
        augmented result has a truthy value.
    """
    include_value = bool(config.value)
    props: list[ConversationProp] = [FieldValue()] if include_value else []

    augmented = await db_augment_conversation_field(field, field_id, props)

    result = hydration_models.HydratedConversationField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.CONVERSATION,
    )
    if include_value and augmented.value:
        result.value = augmented.value
    return result
190
+
191
+
192
async def hydrate_generic_field(
    field: Generic,
    field_id: FieldId,
    config: hydration_models.GenericFieldHydration,
) -> hydration_models.HydratedGenericField:
    """Hydrate a generic field with the properties requested in `config`.

    The field value is requested when `config.value` is truthy and the
    extracted text when `config.extracted_text` is truthy. Both are fetched
    through `db_augment_generic_field`.

    Returns:
        A `HydratedGenericField` with `id`, `resource` and `field_type` always
        set. `value` and `extracted` are only filled in when they were
        requested and the augmented result has a truthy value for them.
    """
    include_value = bool(config.value)
    include_text = bool(config.extracted_text)

    props: list[FieldProp] = []
    if include_value:
        props.append(FieldValue())
    if include_text:
        props.append(FieldText())

    augmented = await db_augment_generic_field(field, field_id, props)

    result = hydration_models.HydratedGenericField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.GENERIC,
    )
    if include_value and augmented.value:
        result.value = augmented.value
    if include_text and augmented.text:
        result.extracted = hydration_models.FieldExtractedData(text=augmented.text)
    return result
@@ -0,0 +1,130 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import base64
21
+ from typing import cast
22
+
23
+ from typing_extensions import assert_never
24
+
25
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId, ParagraphId
26
+ from nucliadb.ingest.fields.base import Field
27
+ from nucliadb.ingest.fields.file import File
28
+ from nucliadb.search import SERVICE_NAME
29
+ from nucliadb_models.common import FieldTypeName
30
+ from nucliadb_models.search import Image
31
+ from nucliadb_protos import resources_pb2
32
+ from nucliadb_utils.utilities import get_storage
33
+
34
+
35
async def paragraph_source_image(
    kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
) -> Image | None:
    """Return the original image a paragraph was extracted from, if any.

    Some paragraphs are extracted from images (using techniques like OCR or
    inception). For those, the paragraph representation references the source
    image, which is downloaded and returned.

    Returns:
        None when the paragraph has no reference file or its kind is neither
        OCR nor INCEPTION; otherwise whatever `download_image` returns for
        the referenced file.
    """
    reference_file = paragraph.representation.reference_file
    image_backed_kinds = (
        resources_pb2.Paragraph.TypeParagraph.OCR,
        resources_pb2.Paragraph.TypeParagraph.INCEPTION,
    )
    if not reference_file or paragraph.kind not in image_backed_kinds:
        return None

    # The reference file holds the original image of a paragraph extracted
    # from an image, but the stored path is incomplete: the file lives under
    # the `generated` folder
    return await download_image(
        kbid,
        paragraph_id.field_id,
        f"generated/{reference_file}",
        # XXX: we assume all reference files are PNG images, but this actually
        # depends on learning so it's a dangerous assumption. We should check it
        # by ourselves
        mime_type="image/png",
    )
67
+
68
+
69
async def download_image(
    kbid: str, field_id: FieldId, image_path: str, *, mime_type: str
) -> Image | None:
    """Download an extracted file of a field and wrap it as an `Image`.

    Args:
        kbid: knowledge box id.
        field_id: field the file belongs to; its `rid`, `type` and `key` are
            used to locate the file in storage.
        image_path: path of the file, passed to `storage.file_extracted`.
        mime_type: content type set on the returned image (it is not checked
            against the downloaded bytes).

    Returns:
        An `Image` carrying the base64-encoded bytes, or None when the
        downloaded content is empty.
    """
    storage = await get_storage(service_name=SERVICE_NAME)
    storage_field = storage.file_extracted(
        kbid, field_id.rid, field_id.type, field_id.key, image_path
    )
    buffer = await storage.downloadbytes(storage_field.bucket, storage_field.key)
    payload = buffer.getvalue()
    if not payload:
        return None
    encoded = base64.b64encode(payload).decode()
    return Image(content_type=mime_type, b64encoded=encoded)
84
+
85
+
86
async def download_page_preview(field: Field, page: int) -> Image | None:
    """Download the preview of a specific page of a field as an `Image`.

    Not all fields have previews, so this function can return None.

    Page previews are uploaded by learning and shared with nucliadb through a
    known path.

    Returns:
        For file fields, the result of downloading
        `generated/extracted_images_{page}.png` (None when the field has no
        file extracted data). None for every other field type.
    """
    field_type = FIELD_TYPE_STR_TO_NAME[field.type]

    if field_type == FieldTypeName.FILE:
        file_field = cast(File, field)
        extracted = await file_field.get_file_extracted_data()
        if extracted is None:
            return None

        # NOTE(review): `<=` accepts a page equal to the number of previews;
        # confirm whether `page` is 0-based or 1-based
        total_previews = len(extracted.file_pages_previews.positions)
        assert page <= total_previews, (
            f"paragraph page number {page} should be less or equal to the total file pages previews {total_previews}"
        )
        return await download_image(
            file_field.kbid,
            file_field.field_id,
            f"generated/extracted_images_{page}.png",
            mime_type="image/png",
        )

    elif field_type == FieldTypeName.LINK:
        # TODO: in case of links, we want to return the link preview, that is a
        # link converted to PDF and screenshotted
        # REVIEW: link preview is an image or a PDF?
        return None

    elif (
        field_type == FieldTypeName.TEXT
        or field_type == FieldTypeName.CONVERSATION
        or field_type == FieldTypeName.GENERIC
    ):
        # these fields don't have previews
        return None

    else:  # pragma: no cover
        assert_never(field_type)