nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/reader/app.py CHANGED
@@ -26,6 +26,7 @@ from starlette.middleware.authentication import AuthenticationMiddleware
 from starlette.requests import ClientDisconnect, Request
 from starlette.responses import HTMLResponse
 
+from nucliadb.middleware import ClientErrorPayloadLoggerMiddleware
 from nucliadb.reader import API_PREFIX
 from nucliadb.reader.api.v1.router import api as api_v1
 from nucliadb.reader.lifecycle import lifespan
@@ -49,6 +50,7 @@ middleware.extend(
             backend=NucliaCloudAuthenticationBackend(),
         ),
         Middleware(AuditMiddleware, audit_utility_getter=get_audit),
+        Middleware(ClientErrorPayloadLoggerMiddleware),
     ]
 )
 
@@ -56,7 +58,6 @@ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
 
 fastapi_settings = dict(
     debug=running_settings.debug,
-    middleware=middleware,
     lifespan=lifespan,
     exception_handlers={
         Exception: global_exception_handler,
@@ -78,6 +79,7 @@ def create_application() -> FastAPI:
         prefix_format=f"/{API_PREFIX}/v{{major}}",
         default_version=(1, 0),
         enable_latest=False,
+        middleware=middleware,
         kwargs=fastapi_settings,
     )
 
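The ClientErrorPayloadLoggerMiddleware registered above is implemented in nucliadb/middleware/__init__.py (+82 -1 in the file list), which is not shown in this diff. As a rough orientation only, a pure-ASGI middleware with that responsibility (logging the request payload when a handler answers with a 4xx) could be sketched as follows; every name in this sketch is hypothetical and it is not the packaged implementation:

import logging

logger = logging.getLogger("payload_logger_sketch")


class PayloadLoggerSketch:
    """Hypothetical sketch, not the nucliadb implementation."""

    def __init__(self, app):
        self.app = app

    async def __call__(self, scope, receive, send):
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        chunks: list[bytes] = []
        status = 0

        async def recv_wrapper():
            # capture request body chunks as the downstream app reads them
            message = await receive()
            if message["type"] == "http.request":
                chunks.append(message.get("body", b""))
            return message

        async def send_wrapper(message):
            nonlocal status
            if message["type"] == "http.response.start":
                status = message["status"]
            await send(message)

        await self.app(scope, recv_wrapper, send_wrapper)
        if 400 <= status < 500:
            # log only a bounded prefix of the payload for client errors
            logger.info(
                "client error payload",
                extra={"status": status, "path": scope["path"], "payload": b"".join(chunks)[:1024]},
            )

Registering such a class through Starlette's Middleware(...) wrapper, as the diff does, works for any ASGI callable of this shape.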
nucliadb/reader/reader/notifications.py CHANGED
@@ -21,7 +21,6 @@ import asyncio
 import contextlib
 import uuid
 from collections.abc import AsyncGenerator
-from typing import Optional
 
 import async_timeout
 from nats.aio.msg import Msg
@@ -200,7 +199,7 @@ async def get_resource_title_cached(
     return resource_title
 
 
-async def get_resource_title(kv_driver: Driver, kbid: str, resource_uuid: str) -> Optional[str]:
+async def get_resource_title(kv_driver: Driver, kbid: str, resource_uuid: str) -> str | None:
     async with kv_driver.ro_transaction() as txn:
         basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=resource_uuid)
         if basic is None:
nucliadb/search/api/v1/__init__.py CHANGED
@@ -19,6 +19,7 @@
 #
 from . import (  # noqa: F401
     ask,
+    augment,
     catalog,
     feedback,
     find,
@@ -26,6 +27,7 @@ from . import (  # noqa: F401
     hydrate,
     knowledgebox,
     predict_proxy,
+    retrieve,
     search,
     suggest,
     summarize,
nucliadb/search/api/v1/ask.py CHANGED
@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import json
-from typing import Optional, Union
 
 from fastapi import Header, Request, Response
 from fastapi_versioning import version
@@ -67,7 +66,7 @@ async def ask_knowledgebox_endpoint(
         description="When set to true, outputs response as JSON in a non-streaming way. "
         "This is slower and requires waiting for entire answer to be ready.",
     ),
-) -> Union[StreamingResponse, HTTPClientError, Response]:
+) -> StreamingResponse | HTTPClientError | Response:
     current_user: NucliaUser = request.user
     # If present, security groups from AuthorizationBackend overrides any
     # security group of the payload
@@ -116,8 +115,8 @@ async def create_ask_response(
     client_type: NucliaDBClientType,
     origin: str,
     x_synchronous: bool,
-    resource: Optional[str] = None,
-    extra_predict_headers: Optional[dict[str, str]] = None,
+    resource: str | None = None,
+    extra_predict_headers: dict[str, str] | None = None,
 ) -> Response:
     maybe_log_request_payload(kbid, "/ask", ask_request)
     ask_request.max_tokens = parse_max_tokens(ask_request.max_tokens)
nucliadb/search/api/v1/augment.py ADDED
@@ -0,0 +1,585 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import asyncio
+from typing import cast
+
+from fastapi import Header, Request
+from fastapi_versioning import version
+
+from nucliadb.common.ids import FieldId, ParagraphId
+from nucliadb.models.internal import augment as internal_augment
+from nucliadb.models.internal.augment import (
+    Augment,
+    Augmented,
+    ConversationAnswerOrAfter,
+    ConversationAttachments,
+    ConversationAugment,
+    ConversationProp,
+    ConversationSelector,
+    ConversationText,
+    DeepResourceAugment,
+    FieldAugment,
+    FieldClassificationLabels,
+    FieldEntities,
+    FieldProp,
+    FieldText,
+    FileAugment,
+    FileProp,
+    FileThumbnail,
+    FullSelector,
+    MessageSelector,
+    Metadata,
+    Paragraph,
+    ParagraphAugment,
+    ParagraphImage,
+    ParagraphPage,
+    ParagraphPosition,
+    ParagraphProp,
+    ParagraphTable,
+    ParagraphText,
+    RelatedParagraphs,
+    ResourceAugment,
+    ResourceClassificationLabels,
+    ResourceProp,
+    ResourceSummary,
+    ResourceTitle,
+    WindowSelector,
+)
+from nucliadb.search.api.v1.router import KB_PREFIX, api
+from nucliadb.search.augmentor import augmentor
+from nucliadb.search.search.cache import request_caches
+from nucliadb_models.augment import (
+    AugmentedConversationField,
+    AugmentedConversationMessage,
+    AugmentedField,
+    AugmentedFileField,
+    AugmentedParagraph,
+    AugmentedResource,
+    AugmentParagraphs,
+    AugmentRequest,
+    AugmentResources,
+    AugmentResponse,
+)
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
+from nucliadb_models.search import NucliaDBClientType, ResourceProperties
+from nucliadb_utils.authentication import requires
+
+
+@api.post(
+    f"/{KB_PREFIX}/{{kbid}}/augment",
+    status_code=200,
+    description="Augment data on a Knowledge Box",
+    include_in_schema=False,
+    tags=["Augment"],
+)
+@requires(NucliaDBRoles.READER)
+@version(1)
+async def _augment_endpoint(
+    request: Request,
+    kbid: str,
+    item: AugmentRequest,
+    x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
+    x_nucliadb_user: str = Header(""),
+    x_forwarded_for: str = Header(""),
+) -> AugmentResponse:
+    return await augment_endpoint(kbid, item)
+
+
+async def augment_endpoint(kbid: str, item: AugmentRequest) -> AugmentResponse:
+    augmentations = parse_first_augments(item)
+
+    if len(augmentations) == 0:
+        return AugmentResponse(resources={}, fields={}, paragraphs={})
+
+    with request_caches():
+        max_ops = asyncio.Semaphore(50)
+
+        first_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
+        response = build_augment_response(item, first_augmented)
+
+        # 2nd round trip to augmentor
+        #
+        # There are some augmentations that require some augmented content to be
+        # able to keep augmenting, such as neighbour paragraphs.
+        #
+        # However, as much data is already cached (when using cache), this
+        # second round should be orders of magnitude faster than the first round.
+        #
+        augmentations = parse_second_augments(item, first_augmented)
+        if len(augmentations) > 0:
+            second_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
+            merge_second_augment(item, response, second_augmented)
+
+    return response
+
+
+def parse_first_augments(item: AugmentRequest) -> list[Augment]:
+    """Parse an augment request and return a list of internal augments to
+    fulfill as much of the requested information as it can.
+
+    Notice there are augments that will require a 2nd round trip to the
+    augmentor, e.g., neighbouring paragraphs. This makes the code a bit more
+    convoluted but avoids synchronization between augments, as many paragraphs
+    could lead to the same neighbours.
+
+    """
+    augmentations: list[Augment] = []
+
+    if item.resources is not None:
+        for resource_augment in item.resources:
+            show, extracted, resource_select = parse_deep_resource_augment(resource_augment)
+            if resource_augment.field_type_filter is None:
+                field_type_filter = list(FieldTypeName)
+            else:
+                field_type_filter = resource_augment.field_type_filter
+
+            if show:
+                augmentations.append(
+                    DeepResourceAugment(
+                        given=resource_augment.given,
+                        show=show,
+                        extracted=extracted,
+                        field_type_filter=field_type_filter,
+                    )
+                )
+            if resource_select:
+                augmentations.append(
+                    ResourceAugment(
+                        given=resource_augment.given,  # type: ignore[arg-type]
+                        select=resource_select,
+                    )
+                )
+
+            if resource_augment.fields is not None:
+                # Augment resource fields with an optional field filter
+                field_select: list[FieldProp] = []
+                if resource_augment.fields.text:
+                    field_select.append(FieldText())
+                if resource_augment.fields.classification_labels:
+                    field_select.append(FieldClassificationLabels())
+
+                augmentations.append(
+                    FieldAugment(
+                        given=resource_augment.given,  # type: ignore[arg-type]
+                        select=field_select,  # type: ignore[arg-type]
+                        filter=resource_augment.fields.filters,
+                    )
+                )
+
+    if item.fields is not None:
+        for field_augment in item.fields:
+            given = [FieldId.from_string(id) for id in field_augment.given]
+            select: list[FieldProp] = []
+            if field_augment.text:
+                select.append(FieldText())
+            if field_augment.entities:
+                select.append(FieldEntities())
+            if field_augment.classification_labels:
+                select.append(FieldClassificationLabels())
+
+            if len(select) > 0:
+                augmentations.append(
+                    FieldAugment(
+                        given=given,
+                        select=select,
+                    )
+                )
+
+            file_select: list[FileProp] = []
+            if field_augment.file_thumbnail:
+                file_select.append(FileThumbnail())
+
+            if len(file_select) > 0:
+                augmentations.append(
+                    FileAugment(
+                        given=given,  # type: ignore
+                        select=file_select,
+                    )
+                )
+
+            conversation_select: list[ConversationProp] = []
+            selector: ConversationSelector
+
+            if field_augment.full_conversation:
+                selector = FullSelector()
+                conversation_select.append(ConversationText(selector=selector))
+                if (
+                    field_augment.conversation_text_attachments
+                    or field_augment.conversation_image_attachments
+                ):
+                    conversation_select.append(ConversationAttachments(selector=selector))
+
+            elif field_augment.max_conversation_messages is not None:
+                # we want to always get the first conversation message and the window
+                # requested by the user
+                first_selector = MessageSelector(index="first")
+                window_selector = WindowSelector(size=field_augment.max_conversation_messages)
+                conversation_select.append(ConversationText(selector=first_selector))
+                conversation_select.append(ConversationText(selector=window_selector))
+                if (
+                    field_augment.conversation_text_attachments
+                    or field_augment.conversation_image_attachments
+                ):
+                    conversation_select.append(ConversationAttachments(selector=first_selector))
+                    conversation_select.append(ConversationAttachments(selector=window_selector))
+
+            if field_augment.conversation_answer_or_messages_after:
+                conversation_select.append(ConversationAnswerOrAfter())
+
+            if len(conversation_select) > 0:
+                augmentations.append(
+                    ConversationAugment(
+                        given=given,  # type: ignore
+                        select=conversation_select,
+                    )
+                )
+
+    if item.paragraphs is not None:
+        for paragraph_augment in item.paragraphs:
+            paragraphs_to_augment, paragraph_selector = parse_paragraph_augment(paragraph_augment)
+            augmentations.append(
+                ParagraphAugment(
+                    given=paragraphs_to_augment,
+                    select=paragraph_selector,
+                )
+            )
+
+    return augmentations
+
+
+def parse_deep_resource_augment(
+    item: AugmentResources,
+) -> tuple[list[ResourceProperties], list[ExtractedDataTypeName], list[ResourceProp]]:
+    show = []
+    if item.basic:
+        show.append(ResourceProperties.BASIC)
+    if item.origin:
+        show.append(ResourceProperties.ORIGIN)
+    if item.extra:
+        show.append(ResourceProperties.EXTRA)
+    if item.relations:
+        show.append(ResourceProperties.RELATIONS)
+    if item.values:
+        show.append(ResourceProperties.VALUES)
+    if item.errors:
+        show.append(ResourceProperties.ERRORS)
+    if item.security:
+        show.append(ResourceProperties.SECURITY)
+
+    extracted = []
+    if item.extracted_text:
+        extracted.append(ExtractedDataTypeName.TEXT)
+    if item.extracted_metadata:
+        extracted.append(ExtractedDataTypeName.METADATA)
+    if item.extracted_shortened_metadata:
+        extracted.append(ExtractedDataTypeName.SHORTENED_METADATA)
+    if item.extracted_large_metadata:
+        extracted.append(ExtractedDataTypeName.LARGE_METADATA)
+    if item.extracted_vector:
+        extracted.append(ExtractedDataTypeName.VECTOR)
+    if item.extracted_link:
+        extracted.append(ExtractedDataTypeName.LINK)
+    if item.extracted_file:
+        extracted.append(ExtractedDataTypeName.FILE)
+    if item.extracted_qa:
+        extracted.append(ExtractedDataTypeName.QA)
+
+    if len(extracted) > 0:
+        show.append(ResourceProperties.EXTRACTED)
+
+    select: list[ResourceProp] = []
+    if item.title:
+        select.append(ResourceTitle())
+    if item.summary:
+        select.append(ResourceSummary())
+    if item.classification_labels:
+        select.append(ResourceClassificationLabels())
+
+    return (
+        show,
+        extracted,
+        select,
+    )
+
+
+def parse_paragraph_augment(item: AugmentParagraphs) -> tuple[list[Paragraph], list[ParagraphProp]]:
+    paragraphs_to_augment = []
+    for paragraph in item.given:
+        try:
+            paragraph_id = ParagraphId.from_string(paragraph.id)
+        except ValueError:
+            # invalid paragraph id, skipping
+            continue
+
+        if paragraph.metadata is None:
+            metadata = None
+        else:
+            metadata = Metadata(
+                is_an_image=paragraph.metadata.is_an_image,
+                is_a_table=paragraph.metadata.is_a_table,
+                source_file=paragraph.metadata.source_file,
+                page=paragraph.metadata.page,
+                in_page_with_visual=paragraph.metadata.in_page_with_visual,
+            )
+
+        paragraphs_to_augment.append(Paragraph(id=paragraph_id, metadata=metadata))
+
+    selector: list[ParagraphProp] = []
+    if item.text:
+        selector.append(ParagraphText())
+    if item.neighbours_before or item.neighbours_after:
+        selector.append(
+            RelatedParagraphs(
+                neighbours_before=item.neighbours_before or 0,
+                neighbours_after=item.neighbours_after or 0,
+            )
+        )
+    if item.source_image:
+        selector.append(ParagraphImage())
+    if item.table_image:
+        selector.append(ParagraphTable(prefer_page_preview=item.table_prefers_page_preview))
+    if item.page_preview_image:
+        selector.append(ParagraphPage(preview=True))
+
+    return paragraphs_to_augment, selector
+
+
+def build_augment_response(item: AugmentRequest, augmented: Augmented) -> AugmentResponse:
+    response = AugmentResponse(
+        resources={},
+        fields={},
+        paragraphs={},
+    )
+
+    # start with deep resources, as they return a Resource object we can merge
+    # with the augmented model
+    for rid, resource_deep in augmented.resources_deep.items():
+        if resource_deep is None:
+            continue
+
+        augmented_resource = AugmentedResource(id=rid)
+        augmented_resource.updated_from(resource_deep)
+        response.resources[rid] = augmented_resource
+
+    # now we can cherry pick properties from the augmented resources and merge
+    # them with the deep ones
+    for rid, resource in augmented.resources.items():
+        if resource is None:
+            continue
+
+        augmented_resource = response.resources.setdefault(rid, AugmentedResource(id=rid))
+
+        # merge resource with deep resources without overwriting
+        augmented_resource.title = augmented_resource.title or resource.title
+        augmented_resource.summary = augmented_resource.summary or resource.summary
+
+        # properties original to the augmented resources (not in deep resource augment)
+        if resource.classification_labels is not None:
+            augmented_resource.classification_labels = {
+                labelset: list(labels) for labelset, labels in resource.classification_labels.items()
+            }
+
+    for field_id, field in augmented.fields.items():
+        if field is None:
+            continue
+
+        # common augments for all fields
+
+        if field.classification_labels is None:
+            classification_labels = None
+        else:
+            classification_labels = {
+                labelset: list(labels) for labelset, labels in field.classification_labels.items()
+            }
+
+        if field.entities is None:
+            entities = None
+        else:
+            entities = {family: list(entity) for family, entity in field.entities.items()}
+
+        if field_id.type in (
+            FieldTypeName.TEXT.abbreviation(),
+            FieldTypeName.LINK.abbreviation(),
+            FieldTypeName.GENERIC.abbreviation(),
+        ):
+            response.fields[field_id.full()] = AugmentedField(
+                text=field.text,  # type: ignore # field is instance of any of the above and has the text property
+                classification_labels=classification_labels,
+                entities=entities,
+            )
+
+        elif field_id.type == FieldTypeName.FILE.abbreviation():
+            field = cast(internal_augment.AugmentedFileField, field)
+            response.fields[field_id.full()] = AugmentedFileField(
+                text=field.text,  # type: ignore # field is instance of any of the above and has the text property
+                classification_labels=classification_labels,
+                entities=entities,
+                thumbnail_image=field.thumbnail_path,
+            )
+
+        elif field_id.type == FieldTypeName.CONVERSATION.abbreviation():
+            field = cast(internal_augment.AugmentedConversationField, field)
+            conversation = AugmentedConversationField(
+                classification_labels=classification_labels,
+                entities=entities,
+            )
+
+            if field.messages is not None:
+                conversation.messages = []
+                for m in field.messages:
+                    if m.attachments is None:
+                        attachments = None
+                    else:
+                        attachments = []
+                        for f in m.attachments:
+                            attachments.append(f.full())
+
+                    conversation.messages.append(
+                        AugmentedConversationMessage(
+                            ident=m.ident,
+                            text=m.text,
+                            attachments=attachments,
+                        )
+                    )
+
+            response.fields[field_id.full()] = conversation
+
+        else:  # pragma: no cover
+            assert False, f"unknown field type: {field_id.type}"
+
+    for paragraph_id, paragraph in augmented.paragraphs.items():
+        if paragraph is None:
+            continue
+
+        augmented_paragraph = AugmentedParagraph()
+        augmented_paragraph.text = paragraph.text
+        if paragraph.related is not None:
+            augmented_paragraph.neighbours_before = list(
+                map(lambda x: x.full(), paragraph.related.neighbours_before)
+            )
+            augmented_paragraph.neighbours_after = list(
+                map(lambda x: x.full(), paragraph.related.neighbours_after)
+            )
+        augmented_paragraph.source_image = paragraph.source_image_path
+        augmented_paragraph.table_image = paragraph.table_image_path
+        augmented_paragraph.page_preview_image = paragraph.page_preview_path
+        response.paragraphs[paragraph_id.full()] = augmented_paragraph
+
+    return response
+
+
+def parse_second_augments(item: AugmentRequest, augmented: Augmented) -> list[Augment]:
+    """Given an augment request and a first augmentation, return a list of
+    augments required to fulfill the requested data.
+
+    """
+    augmentations: list[Augment] = []
+
+    for paragraph_augment in item.paragraphs or []:
+        if paragraph_augment.neighbours_before or paragraph_augment.neighbours_after:
+            neighbours = []
+            for paragraph_id, paragraph in augmented.paragraphs.items():
+                if paragraph.related is not None:
+                    for neighbour_before in paragraph.related.neighbours_before:
+                        neighbours.append(Paragraph(id=neighbour_before, metadata=None))
+                    for neighbour_after in paragraph.related.neighbours_after:
+                        neighbours.append(Paragraph(id=neighbour_after, metadata=None))
+
+            if neighbours:
+                augmentations.append(
+                    ParagraphAugment(
+                        given=neighbours,
+                        select=[
+                            ParagraphText(),
+                            ParagraphPosition(),
+                        ],
+                    )
+                )
+
+    return augmentations
+
+
+def merge_second_augment(item: AugmentRequest, response: AugmentResponse, augmented: Augmented):
+    """Merge in-place augmented data with an existing augment response."""
+
+    if any(
+        (
+            paragraph_augment.neighbours_before or paragraph_augment.neighbours_after
+            for paragraph_augment in item.paragraphs or []
+        )
+    ):
+        # neighbour paragraphs
+
+        new_paragraphs = {}
+        for paragraph_id_str, augmented_paragraph in response.paragraphs.items():
+            before_refs = []
+            for before_id_str in augmented_paragraph.neighbours_before or []:
+                before_id = ParagraphId.from_string(before_id_str)
+
+                if before_id not in augmented.paragraphs:
+                    continue
+                neighbour = augmented.paragraphs[before_id]
+
+                if before_id_str not in response.paragraphs:
+                    if not neighbour.text and not neighbour.position:
+                        continue
+                    # create a new paragraph for the neighbour
+                    new_paragraphs[before_id_str] = AugmentedParagraph(
+                        text=neighbour.text, position=neighbour.position
+                    )
+
+                else:
+                    # merge neighbour with existing paragraph
+                    if not response.paragraphs[before_id_str].text:
+                        response.paragraphs[before_id_str].text = neighbour.text
+
+                before_refs.append(before_id_str)
+
+            after_refs = []
+            for after_id_str in augmented_paragraph.neighbours_after or []:
+                after_id = ParagraphId.from_string(after_id_str)
+
+                if after_id not in augmented.paragraphs:
+                    continue
+                neighbour = augmented.paragraphs[after_id]
+
+                if after_id_str not in response.paragraphs:
+                    if not neighbour.text and not neighbour.position:
+                        continue
+                    # create a new paragraph for the neighbour
+                    new_paragraphs[after_id_str] = AugmentedParagraph(
+                        text=neighbour.text, position=neighbour.position
+                    )
+
+                else:
+                    # merge neighbour with existing paragraph
+                    if not response.paragraphs[after_id_str].text:
+                        response.paragraphs[after_id_str].text = neighbour.text
+
+                after_refs.append(after_id_str)
+
+            # update references to contain only the neighbours that existed in
+            # the response or that we added
+            augmented_paragraph.neighbours_before = before_refs
+            augmented_paragraph.neighbours_after = after_refs
+
+        response.paragraphs.update(new_paragraphs)
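
For a sense of how the new endpoint is exercised, here is a hypothetical request sketch. It assumes a standalone NucliaDB answering on localhost:8080 under /api/v1, KB_PREFIX resolving to "kb", and role-header authentication; the payload keys mirror the AugmentRequest/AugmentParagraphs fields as consumed by parse_first_augments above (the nucliadb_models.augment schemas ship in a separate package and are not part of this diff), and the kbid and paragraph id are placeholders. Note the route is declared with include_in_schema=False, so it will not appear in the public OpenAPI schema.

import httpx

KBID = "00000000-0000-0000-0000-000000000000"  # placeholder knowledge box id

# Requesting neighbours triggers the second augmentor round trip
# documented in augment_endpoint above.
payload = {
    "paragraphs": [
        {
            "given": [{"id": "<rid>/f/<field>/0-120"}],  # placeholder paragraph id string
            "text": True,
            "neighbours_before": 1,
            "neighbours_after": 1,
        }
    ]
}

response = httpx.post(
    f"http://localhost:8080/api/v1/kb/{KBID}/augment",
    json=payload,
    headers={"X-NUCLIADB-ROLES": "READER"},  # standalone role header (assumption)
)
response.raise_for_status()
# AugmentResponse: maps keyed by resource id, field id and paragraph id
data = response.json()
print(data["paragraphs"])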