nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -1,129 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
-
21
- from typing import cast
22
-
23
- from nidx_protos.noderesources_pb2 import Resource as IndexMessage
24
-
25
- from nucliadb.common.maindb.driver import Transaction
26
- from nucliadb.common.maindb.pg import PGDriver, PGTransaction
27
- from nucliadb.common.maindb.utils import get_driver
28
- from nucliadb_telemetry import metrics
29
-
30
- from ..resource import Resource
31
-
32
# Metrics observer for catalog writes. The "type" label is set by each
# decorated function in this module ("update" or "delete").
observer = metrics.Observer(
    "pg_catalog_write",
    labels={"type": ""},
)
33
-
34
-
35
def _pg_transaction(txn: Transaction) -> PGTransaction:
    """Return `txn` typed as a `PGTransaction`.

    This is a static-typing cast only: no runtime check or conversion is
    performed on `txn`.
    """
    pg_txn = cast(PGTransaction, txn)
    return pg_txn
37
-
38
-
39
def pgcatalog_enabled(kbid):
    """Tell whether the catalog can be written, i.e. the driver is a `PGDriver`.

    `kbid` is accepted but not used by the check.
    """
    driver = get_driver()
    return isinstance(driver, PGDriver)
41
-
42
-
43
def extract_facets(labels):
    """Return the set of all facets (path prefixes) of the given labels.

    Each label is split on "/" and the piece before the first "/" is
    discarded. Every leading run of the remaining pieces, joined with and
    prefixed by "/", is a facet: "/l/a/b" yields "/l", "/l/a" and "/l/a/b".
    """
    return {
        "/" + "/".join(pieces[1:depth])
        for pieces in (label.split("/") for label in labels)
        for depth in range(2, len(pieces) + 1)
    }
52
-
53
-
54
@observer.wrap({"type": "update"})
async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
    """Upsert the catalog row of `resource` and rebuild its facet rows.

    Nothing is done when `pgcatalog_enabled(kbid)` is false. The row is keyed
    by `resource.kb.kbid` and `resource.uuid`; its labels are those of
    `index_message` plus the computed field classifications that the user has
    not cancelled.

    Raises:
        ValueError: if the resource has no basic metadata.
    """
    if not pgcatalog_enabled(kbid):
        return

    basic = resource.basic
    if basic is None:
        raise ValueError("Cannot index into the catalog a resource without basic metadata ")

    created_at = basic.created.ToDatetime()
    # The modification date is never stored as earlier than the creation date
    modified_at = max(basic.modified.ToDatetime(), created_at)

    # Labels cancelled by the user must not be indexed
    cancelled_labels = {
        f"/l/{clf.labelset}/{clf.label}"
        for clf in basic.usermetadata.classifications
        if clf.cancelled_by_user
    }

    # Resource labels first, then the classification labels computed for each field
    labels = list(index_message.labels)
    computed_labels = (
        f"/l/{clf.labelset}/{clf.label}"
        for field_classification in basic.computedmetadata.field_classifications
        for clf in field_classification.classifications
    )
    labels.extend(label for label in computed_labels if label not in cancelled_labels)

    # Both tables are addressed by the same (kbid, rid) pair
    row_key = {"kbid": resource.kb.kbid, "rid": resource.uuid}

    async with _pg_transaction(txn).connection.cursor() as cur:
        await cur.execute(
            """
            INSERT INTO catalog
            (kbid, rid, title, created_at, modified_at, labels, slug)
            VALUES
            (%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s, %(slug)s)
            ON CONFLICT (kbid, rid) DO UPDATE SET
            title = excluded.title,
            created_at = excluded.created_at,
            modified_at = excluded.modified_at,
            labels = excluded.labels,
            slug = excluded.slug""",
            {
                **row_key,
                "title": basic.title,
                "created_at": created_at,
                "modified_at": modified_at,
                "labels": labels,
                "slug": basic.slug,
            },
        )
        # Facets are replaced wholesale: drop the old rows, then insert the current ones
        await cur.execute(
            "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
            dict(row_key),
        )
        await cur.execute(
            "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
            {**row_key, "facets": list(extract_facets(labels))},
        )
120
-
121
-
122
@observer.wrap({"type": "delete"})
async def pgcatalog_delete(txn: Transaction, kbid: str, rid: str):
    """Delete the catalog row identified by `kbid` and `rid`.

    Nothing is done when `pgcatalog_enabled(kbid)` is false.
    """
    if pgcatalog_enabled(kbid):
        row_key = {"kbid": kbid, "rid": rid}
        async with _pg_transaction(txn).connection.cursor() as cur:
            await cur.execute("DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", row_key)
@@ -1,197 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import asyncio
21
- import logging
22
- from contextlib import AsyncExitStack
23
- from typing import Optional
24
-
25
- from pydantic import BaseModel
26
-
27
- from nucliadb.common.external_index_providers.base import TextBlockMatch
28
- from nucliadb.common.ids import FieldId
29
- from nucliadb.common.maindb.utils import get_driver
30
- from nucliadb.ingest.serialize import managed_serialize
31
- from nucliadb.search.search import cache, paragraphs
32
- from nucliadb_models.common import FieldTypeName
33
- from nucliadb_models.resource import ExtractedDataTypeName, Resource
34
- from nucliadb_models.search import (
35
- FindParagraph,
36
- ResourceProperties,
37
- )
38
- from nucliadb_telemetry.metrics import Observer
39
- from nucliadb_utils import const
40
- from nucliadb_utils.asyncio_utils import ConcurrentRunner
41
- from nucliadb_utils.utilities import has_feature
42
-
43
# Module-level logger
logger = logging.getLogger(__name__)

# Metrics observer for hydration. Each decorated function in this module
# sets its own "type" label.
hydrator_observer = Observer(
    "hydrator",
    labels={"type": ""},
)
46
-
47
-
48
class ResourceHydrationOptions(BaseModel):
    """
    Settings that control how a resource is hydrated.
    """

    # resource properties, passed as `show` when the resource is serialized
    show: list[ResourceProperties] = []
    # extracted data types, passed as `extracted` when the resource is serialized
    extracted: list[ExtractedDataTypeName] = []
    # field types, passed as `field_type_filter` when the resource is serialized
    field_type_filter: list[FieldTypeName] = []
56
-
57
-
58
class TextBlockHydrationOptions(BaseModel):
    """
    Settings that control how a text block (aka paragraph) is hydrated.
    """

    # when true, the text block is highlighted with `<mark>...</mark>` tags
    highlight: bool = False

    # exact matches to highlight, if any
    ematches: Optional[list[str]] = None

    # when true, a text block whose text is already populated is left untouched
    only_hydrate_empty: bool = False
71
-
72
-
73
@hydrator_observer.wrap({"type": "resource_text"})
async def hydrate_resource_text(
    kbid: str, rid: str, *, max_concurrent_tasks: int
) -> list[tuple[FieldId, str]]:
    """Collect the extracted text of every field of a resource, plus its summary.

    Returns one `(field_id, text)` pair per field whose text could be
    hydrated, or an empty list when the resource is not found. At most
    `max_concurrent_tasks` is given to the runner as its task limit.
    """
    resource = await cache.get_resource(kbid, rid)
    if resource is None:  # pragma: no cover
        return []

    async with get_driver().ro_transaction() as txn:
        resource.txn = txn
        runner = ConcurrentRunner(max_tasks=max_concurrent_tasks)

        # One text-hydration task per field of the resource
        for field_type, field_key in await resource.get_fields(force=True):
            runner.schedule(hydrate_field_text(kbid, FieldId.from_pb(rid, field_type, field_key)))

        # The summary is included as well
        summary_field = FieldId(rid=rid, type="a", key="summary")
        runner.schedule(hydrate_field_text(kbid, summary_field))

        # Gather every scheduled result
        hydrated = await runner.wait()

    # Fields that produced no text are reported as None and dropped here
    return [field_text for field_text in hydrated if field_text is not None]
96
-
97
-
98
@hydrator_observer.wrap({"type": "resource_metadata"})
async def hydrate_resource_metadata(
    kbid: str,
    resource_id: str,
    options: ResourceHydrationOptions,
    *,
    concurrency_control: Optional[asyncio.Semaphore] = None,
    service_name: Optional[str] = None,
) -> Optional[Resource]:
    """Fetch resource metadata and return it serialized.

    Args:
        kbid: knowledge box the resource belongs to.
        resource_id: id of the resource to serialize.
        options: what to include in the serialized resource. It is only read,
            never modified.
        concurrency_control: optional semaphore held while the resource is
            read and serialized.
        service_name: forwarded as-is to `managed_serialize`.

    Returns:
        The serialized resource, or None (after logging a warning) when
        `managed_serialize` returns None.
    """
    show = options.show
    extracted = options.extracted

    if ResourceProperties.EXTRACTED in show and has_feature(
        const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
    ):
        # Returning extracted metadata in search results is deprecated and this flag
        # will be set to True for all KBs in the future.
        #
        # Build a filtered copy rather than calling `show.remove(...)`: `show` is the
        # caller's own `options.show` list, so removing in place would silently alter
        # an options object that may be reused, and would drop only the first
        # occurrence of the property.
        show = [prop for prop in show if prop != ResourceProperties.EXTRACTED]
        extracted = []

    async with AsyncExitStack() as stack:
        # Hold the semaphore, when one is given, for the whole read
        if concurrency_control is not None:
            await stack.enter_async_context(concurrency_control)

        async with get_driver().ro_transaction() as ro_txn:
            serialized_resource = await managed_serialize(
                txn=ro_txn,
                kbid=kbid,
                rid=resource_id,
                show=show,
                field_type_filter=options.field_type_filter,
                extracted=extracted,
                service_name=service_name,
            )
            if serialized_resource is None:
                logger.warning(
                    "Resource not found in database", extra={"kbid": kbid, "rid": resource_id}
                )
    return serialized_resource
138
-
139
-
140
@hydrator_observer.wrap({"type": "field_text"})
async def hydrate_field_text(
    kbid: str,
    field_id: FieldId,
) -> Optional[tuple[FieldId, str]]:
    """Return `(field_id, text)` for a field, or None if it has no extracted text.

    When `field_id` has a subfield id, the text is taken from the matching
    `split_text` entry; otherwise the whole field text is returned.
    """
    extracted = await cache.get_extracted_text_from_field_id(kbid, field_id)
    if extracted is None:  # pragma: no cover
        return None

    subfield = field_id.subfield_id
    text = extracted.split_text[subfield] if subfield else extracted.text
    return field_id, text
153
-
154
-
155
@hydrator_observer.wrap({"type": "text_block"})
async def hydrate_text_block(
    kbid: str,
    text_block: TextBlockMatch,
    options: TextBlockHydrationOptions,
    *,
    concurrency_control: Optional[asyncio.Semaphore] = None,
) -> TextBlockMatch:
    """Fill in the text of `text_block` and return that same object.

    The block is returned untouched when `options.only_hydrate_empty` is set
    and it already has text. Otherwise its paragraph text is fetched, holding
    `concurrency_control` (when given) for the duration of the fetch.
    """
    skip_hydration = options.only_hydrate_empty and text_block.text
    if skip_hydration:
        return text_block

    async with AsyncExitStack() as exit_stack:
        if concurrency_control is not None:
            await exit_stack.enter_async_context(concurrency_control)

        hydrated_text = await paragraphs.get_paragraph_text(
            kbid=kbid,
            paragraph_id=text_block.paragraph_id,
            highlight=options.highlight,
            matches=[],  # TODO: this was never implemented
            ematches=options.ematches,
        )
        text_block.text = hydrated_text
    return text_block
181
-
182
-
183
def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
    """Build a `FindParagraph` from the attributes of a `TextBlockMatch`."""
    paragraph_id = text_block.paragraph_id.full()
    # A missing or empty text becomes an empty string
    text = text_block.text if text_block.text else ""
    return FindParagraph(
        id=paragraph_id,
        text=text,
        score=text_block.score,
        score_type=text_block.score_type,
        order=text_block.order,
        labels=text_block.paragraph_labels,
        fuzzy_result=text_block.fuzzy_search,
        is_a_table=text_block.is_a_table,
        reference=text_block.representation_file,
        page_with_visual=text_block.page_with_visual,
        position=text_block.position,
        relevant_relations=text_block.relevant_relations,
    )