nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/reader/app.py
CHANGED
|
@@ -26,6 +26,7 @@ from starlette.middleware.authentication import AuthenticationMiddleware
|
|
|
26
26
|
from starlette.requests import ClientDisconnect, Request
|
|
27
27
|
from starlette.responses import HTMLResponse
|
|
28
28
|
|
|
29
|
+
from nucliadb.middleware import ClientErrorPayloadLoggerMiddleware
|
|
29
30
|
from nucliadb.reader import API_PREFIX
|
|
30
31
|
from nucliadb.reader.api.v1.router import api as api_v1
|
|
31
32
|
from nucliadb.reader.lifecycle import lifespan
|
|
@@ -49,6 +50,7 @@ middleware.extend(
|
|
|
49
50
|
backend=NucliaCloudAuthenticationBackend(),
|
|
50
51
|
),
|
|
51
52
|
Middleware(AuditMiddleware, audit_utility_getter=get_audit),
|
|
53
|
+
Middleware(ClientErrorPayloadLoggerMiddleware),
|
|
52
54
|
]
|
|
53
55
|
)
|
|
54
56
|
|
|
@@ -56,7 +58,6 @@ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
|
|
56
58
|
|
|
57
59
|
fastapi_settings = dict(
|
|
58
60
|
debug=running_settings.debug,
|
|
59
|
-
middleware=middleware,
|
|
60
61
|
lifespan=lifespan,
|
|
61
62
|
exception_handlers={
|
|
62
63
|
Exception: global_exception_handler,
|
|
@@ -78,6 +79,7 @@ def create_application() -> FastAPI:
|
|
|
78
79
|
prefix_format=f"/{API_PREFIX}/v{{major}}",
|
|
79
80
|
default_version=(1, 0),
|
|
80
81
|
enable_latest=False,
|
|
82
|
+
middleware=middleware,
|
|
81
83
|
kwargs=fastapi_settings,
|
|
82
84
|
)
|
|
83
85
|
|
|
@@ -21,7 +21,6 @@ import asyncio
|
|
|
21
21
|
import contextlib
|
|
22
22
|
import uuid
|
|
23
23
|
from collections.abc import AsyncGenerator
|
|
24
|
-
from typing import Optional
|
|
25
24
|
|
|
26
25
|
import async_timeout
|
|
27
26
|
from nats.aio.msg import Msg
|
|
@@ -200,7 +199,7 @@ async def get_resource_title_cached(
|
|
|
200
199
|
return resource_title
|
|
201
200
|
|
|
202
201
|
|
|
203
|
-
async def get_resource_title(kv_driver: Driver, kbid: str, resource_uuid: str) ->
|
|
202
|
+
async def get_resource_title(kv_driver: Driver, kbid: str, resource_uuid: str) -> str | None:
|
|
204
203
|
async with kv_driver.ro_transaction() as txn:
|
|
205
204
|
basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=resource_uuid)
|
|
206
205
|
if basic is None:
|
nucliadb/search/api/v1/ask.py
CHANGED
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
|
-
from typing import Optional, Union
|
|
22
21
|
|
|
23
22
|
from fastapi import Header, Request, Response
|
|
24
23
|
from fastapi_versioning import version
|
|
@@ -67,7 +66,7 @@ async def ask_knowledgebox_endpoint(
|
|
|
67
66
|
description="When set to true, outputs response as JSON in a non-streaming way. "
|
|
68
67
|
"This is slower and requires waiting for entire answer to be ready.",
|
|
69
68
|
),
|
|
70
|
-
) ->
|
|
69
|
+
) -> StreamingResponse | HTTPClientError | Response:
|
|
71
70
|
current_user: NucliaUser = request.user
|
|
72
71
|
# If present, security groups from AuthorizationBackend overrides any
|
|
73
72
|
# security group of the payload
|
|
@@ -116,8 +115,8 @@ async def create_ask_response(
|
|
|
116
115
|
client_type: NucliaDBClientType,
|
|
117
116
|
origin: str,
|
|
118
117
|
x_synchronous: bool,
|
|
119
|
-
resource:
|
|
120
|
-
extra_predict_headers:
|
|
118
|
+
resource: str | None = None,
|
|
119
|
+
extra_predict_headers: dict[str, str] | None = None,
|
|
121
120
|
) -> Response:
|
|
122
121
|
maybe_log_request_payload(kbid, "/ask", ask_request)
|
|
123
122
|
ask_request.max_tokens = parse_max_tokens(ask_request.max_tokens)
|
|
@@ -0,0 +1,585 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
import asyncio
|
|
22
|
+
from typing import cast
|
|
23
|
+
|
|
24
|
+
from fastapi import Header, Request
|
|
25
|
+
from fastapi_versioning import version
|
|
26
|
+
|
|
27
|
+
from nucliadb.common.ids import FieldId, ParagraphId
|
|
28
|
+
from nucliadb.models.internal import augment as internal_augment
|
|
29
|
+
from nucliadb.models.internal.augment import (
|
|
30
|
+
Augment,
|
|
31
|
+
Augmented,
|
|
32
|
+
ConversationAnswerOrAfter,
|
|
33
|
+
ConversationAttachments,
|
|
34
|
+
ConversationAugment,
|
|
35
|
+
ConversationProp,
|
|
36
|
+
ConversationSelector,
|
|
37
|
+
ConversationText,
|
|
38
|
+
DeepResourceAugment,
|
|
39
|
+
FieldAugment,
|
|
40
|
+
FieldClassificationLabels,
|
|
41
|
+
FieldEntities,
|
|
42
|
+
FieldProp,
|
|
43
|
+
FieldText,
|
|
44
|
+
FileAugment,
|
|
45
|
+
FileProp,
|
|
46
|
+
FileThumbnail,
|
|
47
|
+
FullSelector,
|
|
48
|
+
MessageSelector,
|
|
49
|
+
Metadata,
|
|
50
|
+
Paragraph,
|
|
51
|
+
ParagraphAugment,
|
|
52
|
+
ParagraphImage,
|
|
53
|
+
ParagraphPage,
|
|
54
|
+
ParagraphPosition,
|
|
55
|
+
ParagraphProp,
|
|
56
|
+
ParagraphTable,
|
|
57
|
+
ParagraphText,
|
|
58
|
+
RelatedParagraphs,
|
|
59
|
+
ResourceAugment,
|
|
60
|
+
ResourceClassificationLabels,
|
|
61
|
+
ResourceProp,
|
|
62
|
+
ResourceSummary,
|
|
63
|
+
ResourceTitle,
|
|
64
|
+
WindowSelector,
|
|
65
|
+
)
|
|
66
|
+
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
|
67
|
+
from nucliadb.search.augmentor import augmentor
|
|
68
|
+
from nucliadb.search.search.cache import request_caches
|
|
69
|
+
from nucliadb_models.augment import (
|
|
70
|
+
AugmentedConversationField,
|
|
71
|
+
AugmentedConversationMessage,
|
|
72
|
+
AugmentedField,
|
|
73
|
+
AugmentedFileField,
|
|
74
|
+
AugmentedParagraph,
|
|
75
|
+
AugmentedResource,
|
|
76
|
+
AugmentParagraphs,
|
|
77
|
+
AugmentRequest,
|
|
78
|
+
AugmentResources,
|
|
79
|
+
AugmentResponse,
|
|
80
|
+
)
|
|
81
|
+
from nucliadb_models.common import FieldTypeName
|
|
82
|
+
from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
|
|
83
|
+
from nucliadb_models.search import NucliaDBClientType, ResourceProperties
|
|
84
|
+
from nucliadb_utils.authentication import requires
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@api.post(
|
|
88
|
+
f"/{KB_PREFIX}/{{kbid}}/augment",
|
|
89
|
+
status_code=200,
|
|
90
|
+
description="Augment data on a Knowledge Box",
|
|
91
|
+
include_in_schema=False,
|
|
92
|
+
tags=["Augment"],
|
|
93
|
+
)
|
|
94
|
+
@requires(NucliaDBRoles.READER)
|
|
95
|
+
@version(1)
|
|
96
|
+
async def _augment_endpoint(
|
|
97
|
+
request: Request,
|
|
98
|
+
kbid: str,
|
|
99
|
+
item: AugmentRequest,
|
|
100
|
+
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
101
|
+
x_nucliadb_user: str = Header(""),
|
|
102
|
+
x_forwarded_for: str = Header(""),
|
|
103
|
+
) -> AugmentResponse:
|
|
104
|
+
return await augment_endpoint(kbid, item)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
async def augment_endpoint(kbid: str, item: AugmentRequest) -> AugmentResponse:
|
|
108
|
+
augmentations = parse_first_augments(item)
|
|
109
|
+
|
|
110
|
+
if len(augmentations) == 0:
|
|
111
|
+
return AugmentResponse(resources={}, fields={}, paragraphs={})
|
|
112
|
+
|
|
113
|
+
with request_caches():
|
|
114
|
+
max_ops = asyncio.Semaphore(50)
|
|
115
|
+
|
|
116
|
+
first_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
|
|
117
|
+
response = build_augment_response(item, first_augmented)
|
|
118
|
+
|
|
119
|
+
# 2nd round trip to augmentor
|
|
120
|
+
#
|
|
121
|
+
# There are some augmentations that require some augmented content to be
|
|
122
|
+
# able to keep augmenting, as neighbour paragraphs.
|
|
123
|
+
#
|
|
124
|
+
# However, as many data is already cached (when using cache), this
|
|
125
|
+
# second round should be orders of magnitude faster than the first round.
|
|
126
|
+
#
|
|
127
|
+
augmentations = parse_second_augments(item, first_augmented)
|
|
128
|
+
if len(augmentations) > 0:
|
|
129
|
+
second_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
|
|
130
|
+
merge_second_augment(item, response, second_augmented)
|
|
131
|
+
|
|
132
|
+
return response
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def parse_first_augments(item: AugmentRequest) -> list[Augment]:
|
|
136
|
+
"""Parse an augment request and return a list of internal augments to
|
|
137
|
+
fulfill as much as the requested information as it can.
|
|
138
|
+
|
|
139
|
+
Notice there are augments that will require a 2nd round trip to the
|
|
140
|
+
augmentor, e.g., neighbouring paragraphs. This makes code a bit more
|
|
141
|
+
convoluted but avoids synchronization between augments, as many paragraphs
|
|
142
|
+
could lead to the same neighbours.
|
|
143
|
+
|
|
144
|
+
"""
|
|
145
|
+
augmentations: list[Augment] = []
|
|
146
|
+
|
|
147
|
+
if item.resources is not None:
|
|
148
|
+
for resource_augment in item.resources:
|
|
149
|
+
show, extracted, resource_select = parse_deep_resource_augment(resource_augment)
|
|
150
|
+
if resource_augment.field_type_filter is None:
|
|
151
|
+
field_type_filter = list(FieldTypeName)
|
|
152
|
+
else:
|
|
153
|
+
field_type_filter = resource_augment.field_type_filter
|
|
154
|
+
|
|
155
|
+
if show:
|
|
156
|
+
augmentations.append(
|
|
157
|
+
DeepResourceAugment(
|
|
158
|
+
given=resource_augment.given,
|
|
159
|
+
show=show,
|
|
160
|
+
extracted=extracted,
|
|
161
|
+
field_type_filter=field_type_filter,
|
|
162
|
+
)
|
|
163
|
+
)
|
|
164
|
+
if resource_select:
|
|
165
|
+
augmentations.append(
|
|
166
|
+
ResourceAugment(
|
|
167
|
+
given=resource_augment.given, # type: ignore[arg-type]
|
|
168
|
+
select=resource_select,
|
|
169
|
+
)
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
if resource_augment.fields is not None:
|
|
173
|
+
# Augment resource fields with an optional field filter
|
|
174
|
+
field_select: list[FieldProp] = []
|
|
175
|
+
if resource_augment.fields.text:
|
|
176
|
+
field_select.append(FieldText())
|
|
177
|
+
if resource_augment.fields.classification_labels:
|
|
178
|
+
field_select.append(FieldClassificationLabels())
|
|
179
|
+
|
|
180
|
+
augmentations.append(
|
|
181
|
+
FieldAugment(
|
|
182
|
+
given=resource_augment.given, # type: ignore[arg-type]
|
|
183
|
+
select=field_select, # type: ignore[arg-type]
|
|
184
|
+
filter=resource_augment.fields.filters,
|
|
185
|
+
)
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
if item.fields is not None:
|
|
189
|
+
for field_augment in item.fields:
|
|
190
|
+
given = [FieldId.from_string(id) for id in field_augment.given]
|
|
191
|
+
select: list[FieldProp] = []
|
|
192
|
+
if field_augment.text:
|
|
193
|
+
select.append(FieldText())
|
|
194
|
+
if field_augment.entities:
|
|
195
|
+
select.append(FieldEntities())
|
|
196
|
+
if field_augment.classification_labels:
|
|
197
|
+
select.append(FieldClassificationLabels())
|
|
198
|
+
|
|
199
|
+
if len(select) > 0:
|
|
200
|
+
augmentations.append(
|
|
201
|
+
FieldAugment(
|
|
202
|
+
given=given,
|
|
203
|
+
select=select,
|
|
204
|
+
)
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
file_select: list[FileProp] = []
|
|
208
|
+
if field_augment.file_thumbnail:
|
|
209
|
+
file_select.append(FileThumbnail())
|
|
210
|
+
|
|
211
|
+
if len(file_select) > 0:
|
|
212
|
+
augmentations.append(
|
|
213
|
+
FileAugment(
|
|
214
|
+
given=given, # type: ignore
|
|
215
|
+
select=file_select,
|
|
216
|
+
)
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
conversation_select: list[ConversationProp] = []
|
|
220
|
+
selector: ConversationSelector
|
|
221
|
+
|
|
222
|
+
if field_augment.full_conversation:
|
|
223
|
+
selector = FullSelector()
|
|
224
|
+
conversation_select.append(ConversationText(selector=selector))
|
|
225
|
+
if (
|
|
226
|
+
field_augment.conversation_text_attachments
|
|
227
|
+
or field_augment.conversation_image_attachments
|
|
228
|
+
):
|
|
229
|
+
conversation_select.append(ConversationAttachments(selector=selector))
|
|
230
|
+
|
|
231
|
+
elif field_augment.max_conversation_messages is not None:
|
|
232
|
+
# we want to always get the first conversation and the window
|
|
233
|
+
# requested by the user
|
|
234
|
+
first_selector = MessageSelector(index="first")
|
|
235
|
+
window_selector = WindowSelector(size=field_augment.max_conversation_messages)
|
|
236
|
+
conversation_select.append(ConversationText(selector=first_selector))
|
|
237
|
+
conversation_select.append(ConversationText(selector=window_selector))
|
|
238
|
+
if (
|
|
239
|
+
field_augment.conversation_text_attachments
|
|
240
|
+
or field_augment.conversation_image_attachments
|
|
241
|
+
):
|
|
242
|
+
conversation_select.append(ConversationAttachments(selector=first_selector))
|
|
243
|
+
conversation_select.append(ConversationAttachments(selector=window_selector))
|
|
244
|
+
|
|
245
|
+
if field_augment.conversation_answer_or_messages_after:
|
|
246
|
+
conversation_select.append(ConversationAnswerOrAfter())
|
|
247
|
+
|
|
248
|
+
if len(conversation_select) > 0:
|
|
249
|
+
augmentations.append(
|
|
250
|
+
ConversationAugment(
|
|
251
|
+
given=given, # type: ignore
|
|
252
|
+
select=conversation_select,
|
|
253
|
+
)
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
if item.paragraphs is not None:
|
|
257
|
+
for paragraph_augment in item.paragraphs:
|
|
258
|
+
paragraphs_to_augment, paragraph_selector = parse_paragraph_augment(paragraph_augment)
|
|
259
|
+
augmentations.append(
|
|
260
|
+
ParagraphAugment(
|
|
261
|
+
given=paragraphs_to_augment,
|
|
262
|
+
select=paragraph_selector,
|
|
263
|
+
)
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
return augmentations
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def parse_deep_resource_augment(
|
|
270
|
+
item: AugmentResources,
|
|
271
|
+
) -> tuple[list[ResourceProperties], list[ExtractedDataTypeName], list[ResourceProp]]:
|
|
272
|
+
show = []
|
|
273
|
+
if item.basic:
|
|
274
|
+
show.append(ResourceProperties.BASIC)
|
|
275
|
+
if item.origin:
|
|
276
|
+
show.append(ResourceProperties.ORIGIN)
|
|
277
|
+
if item.extra:
|
|
278
|
+
show.append(ResourceProperties.EXTRA)
|
|
279
|
+
if item.relations:
|
|
280
|
+
show.append(ResourceProperties.RELATIONS)
|
|
281
|
+
if item.values:
|
|
282
|
+
show.append(ResourceProperties.VALUES)
|
|
283
|
+
if item.errors:
|
|
284
|
+
show.append(ResourceProperties.ERRORS)
|
|
285
|
+
if item.security:
|
|
286
|
+
show.append(ResourceProperties.SECURITY)
|
|
287
|
+
|
|
288
|
+
extracted = []
|
|
289
|
+
if item.extracted_text:
|
|
290
|
+
extracted.append(ExtractedDataTypeName.TEXT)
|
|
291
|
+
if item.extracted_metadata:
|
|
292
|
+
extracted.append(ExtractedDataTypeName.METADATA)
|
|
293
|
+
if item.extracted_shortened_metadata:
|
|
294
|
+
extracted.append(ExtractedDataTypeName.SHORTENED_METADATA)
|
|
295
|
+
if item.extracted_large_metadata:
|
|
296
|
+
extracted.append(ExtractedDataTypeName.LARGE_METADATA)
|
|
297
|
+
if item.extracted_vector:
|
|
298
|
+
extracted.append(ExtractedDataTypeName.VECTOR)
|
|
299
|
+
if item.extracted_link:
|
|
300
|
+
extracted.append(ExtractedDataTypeName.LINK)
|
|
301
|
+
if item.extracted_file:
|
|
302
|
+
extracted.append(ExtractedDataTypeName.FILE)
|
|
303
|
+
if item.extracted_qa:
|
|
304
|
+
extracted.append(ExtractedDataTypeName.QA)
|
|
305
|
+
|
|
306
|
+
if len(extracted) > 0:
|
|
307
|
+
show.append(ResourceProperties.EXTRACTED)
|
|
308
|
+
|
|
309
|
+
select: list[ResourceProp] = []
|
|
310
|
+
if item.title:
|
|
311
|
+
select.append(ResourceTitle())
|
|
312
|
+
if item.summary:
|
|
313
|
+
select.append(ResourceSummary())
|
|
314
|
+
if item.classification_labels:
|
|
315
|
+
select.append(ResourceClassificationLabels())
|
|
316
|
+
|
|
317
|
+
return (
|
|
318
|
+
show,
|
|
319
|
+
extracted,
|
|
320
|
+
select,
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def parse_paragraph_augment(item: AugmentParagraphs) -> tuple[list[Paragraph], list[ParagraphProp]]:
|
|
325
|
+
paragraphs_to_augment = []
|
|
326
|
+
for paragraph in item.given:
|
|
327
|
+
try:
|
|
328
|
+
paragraph_id = ParagraphId.from_string(paragraph.id)
|
|
329
|
+
except ValueError:
|
|
330
|
+
# invalid paragraph id, skipping
|
|
331
|
+
continue
|
|
332
|
+
|
|
333
|
+
if paragraph.metadata is None:
|
|
334
|
+
metadata = None
|
|
335
|
+
else:
|
|
336
|
+
metadata = Metadata(
|
|
337
|
+
is_an_image=paragraph.metadata.is_an_image,
|
|
338
|
+
is_a_table=paragraph.metadata.is_a_table,
|
|
339
|
+
source_file=paragraph.metadata.source_file,
|
|
340
|
+
page=paragraph.metadata.page,
|
|
341
|
+
in_page_with_visual=paragraph.metadata.in_page_with_visual,
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
paragraphs_to_augment.append(Paragraph(id=paragraph_id, metadata=metadata))
|
|
345
|
+
|
|
346
|
+
selector: list[ParagraphProp] = []
|
|
347
|
+
if item.text:
|
|
348
|
+
selector.append(ParagraphText())
|
|
349
|
+
if item.neighbours_before or item.neighbours_after:
|
|
350
|
+
selector.append(
|
|
351
|
+
RelatedParagraphs(
|
|
352
|
+
neighbours_before=item.neighbours_before or 0,
|
|
353
|
+
neighbours_after=item.neighbours_after or 0,
|
|
354
|
+
)
|
|
355
|
+
)
|
|
356
|
+
if item.source_image:
|
|
357
|
+
selector.append(ParagraphImage())
|
|
358
|
+
if item.table_image:
|
|
359
|
+
selector.append(ParagraphTable(prefer_page_preview=item.table_prefers_page_preview))
|
|
360
|
+
if item.page_preview_image:
|
|
361
|
+
selector.append(ParagraphPage(preview=True))
|
|
362
|
+
|
|
363
|
+
return paragraphs_to_augment, selector
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def build_augment_response(item: AugmentRequest, augmented: Augmented) -> AugmentResponse:
|
|
367
|
+
response = AugmentResponse(
|
|
368
|
+
resources={},
|
|
369
|
+
fields={},
|
|
370
|
+
paragraphs={},
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
# start with deep resources, as they return a Resource object we can merge
|
|
374
|
+
# with the augmented model
|
|
375
|
+
for rid, resource_deep in augmented.resources_deep.items():
|
|
376
|
+
if resource_deep is None:
|
|
377
|
+
continue
|
|
378
|
+
|
|
379
|
+
augmented_resource = AugmentedResource(id=rid)
|
|
380
|
+
augmented_resource.updated_from(resource_deep)
|
|
381
|
+
response.resources[rid] = augmented_resource
|
|
382
|
+
|
|
383
|
+
# now we can cherry pick properties from the augmented resources and merge
|
|
384
|
+
# them with the deep ones
|
|
385
|
+
for rid, resource in augmented.resources.items():
|
|
386
|
+
if resource is None:
|
|
387
|
+
continue
|
|
388
|
+
|
|
389
|
+
augmented_resource = response.resources.setdefault(rid, AugmentedResource(id=rid))
|
|
390
|
+
|
|
391
|
+
# merge resource with deep resources without overwriting
|
|
392
|
+
augmented_resource.title = augmented_resource.title or resource.title
|
|
393
|
+
augmented_resource.summary = augmented_resource.summary or resource.summary
|
|
394
|
+
|
|
395
|
+
# properties original to the augmented resources (not in deep resource augment)
|
|
396
|
+
if resource.classification_labels is not None:
|
|
397
|
+
augmented_resource.classification_labels = {
|
|
398
|
+
labelset: list(labels) for labelset, labels in resource.classification_labels.items()
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
for field_id, field in augmented.fields.items():
|
|
402
|
+
if field is None:
|
|
403
|
+
continue
|
|
404
|
+
|
|
405
|
+
# common augments for all fields
|
|
406
|
+
|
|
407
|
+
if field.classification_labels is None:
|
|
408
|
+
classification_labels = None
|
|
409
|
+
else:
|
|
410
|
+
classification_labels = {
|
|
411
|
+
labelset: list(labels) for labelset, labels in field.classification_labels.items()
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
if field.entities is None:
|
|
415
|
+
entities = None
|
|
416
|
+
else:
|
|
417
|
+
entities = {family: list(entity) for family, entity in field.entities.items()}
|
|
418
|
+
|
|
419
|
+
if field_id.type in (
|
|
420
|
+
FieldTypeName.TEXT.abbreviation(),
|
|
421
|
+
FieldTypeName.LINK.abbreviation(),
|
|
422
|
+
FieldTypeName.GENERIC.abbreviation(),
|
|
423
|
+
):
|
|
424
|
+
response.fields[field_id.full()] = AugmentedField(
|
|
425
|
+
text=field.text, # type: ignore # field is instance of any of the above and has the text property
|
|
426
|
+
classification_labels=classification_labels,
|
|
427
|
+
entities=entities,
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
elif field_id.type == FieldTypeName.FILE.abbreviation():
|
|
431
|
+
field = cast(internal_augment.AugmentedFileField, field)
|
|
432
|
+
response.fields[field_id.full()] = AugmentedFileField(
|
|
433
|
+
text=field.text, # type: ignore # field is instance of any of the above and has the text property
|
|
434
|
+
classification_labels=classification_labels,
|
|
435
|
+
entities=entities,
|
|
436
|
+
thumbnail_image=field.thumbnail_path,
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
elif field_id.type == FieldTypeName.CONVERSATION.abbreviation():
|
|
440
|
+
field = cast(internal_augment.AugmentedConversationField, field)
|
|
441
|
+
conversation = AugmentedConversationField(
|
|
442
|
+
classification_labels=classification_labels,
|
|
443
|
+
entities=entities,
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
if field.messages is not None:
|
|
447
|
+
conversation.messages = []
|
|
448
|
+
for m in field.messages:
|
|
449
|
+
if m.attachments is None:
|
|
450
|
+
attachments = None
|
|
451
|
+
else:
|
|
452
|
+
attachments = []
|
|
453
|
+
for f in m.attachments:
|
|
454
|
+
attachments.append(f.full())
|
|
455
|
+
|
|
456
|
+
conversation.messages.append(
|
|
457
|
+
AugmentedConversationMessage(
|
|
458
|
+
ident=m.ident,
|
|
459
|
+
text=m.text,
|
|
460
|
+
attachments=attachments,
|
|
461
|
+
)
|
|
462
|
+
)
|
|
463
|
+
|
|
464
|
+
response.fields[field_id.full()] = conversation
|
|
465
|
+
|
|
466
|
+
else: # pragma: no cover
|
|
467
|
+
assert False, f"unknown field type: {field_id.type}"
|
|
468
|
+
|
|
469
|
+
for paragraph_id, paragraph in augmented.paragraphs.items():
|
|
470
|
+
if paragraph is None:
|
|
471
|
+
continue
|
|
472
|
+
|
|
473
|
+
augmented_paragraph = AugmentedParagraph()
|
|
474
|
+
augmented_paragraph.text = paragraph.text
|
|
475
|
+
if paragraph.related is not None:
|
|
476
|
+
augmented_paragraph.neighbours_before = list(
|
|
477
|
+
map(lambda x: x.full(), paragraph.related.neighbours_before)
|
|
478
|
+
)
|
|
479
|
+
augmented_paragraph.neighbours_after = list(
|
|
480
|
+
map(lambda x: x.full(), paragraph.related.neighbours_after)
|
|
481
|
+
)
|
|
482
|
+
augmented_paragraph.source_image = paragraph.source_image_path
|
|
483
|
+
augmented_paragraph.table_image = paragraph.table_image_path
|
|
484
|
+
augmented_paragraph.page_preview_image = paragraph.page_preview_path
|
|
485
|
+
response.paragraphs[paragraph_id.full()] = augmented_paragraph
|
|
486
|
+
|
|
487
|
+
return response
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def parse_second_augments(item: AugmentRequest, augmented: Augmented) -> list[Augment]:
    """Given an augment request and a first augmentation, return a list of
    augments required to fulfill the requested data.

    Currently this only covers neighbour paragraphs: when the request asked
    for neighbours before/after, a second augment is needed to fetch the text
    and position of each neighbour referenced by the first augmentation.
    """
    augmentations: list[Augment] = []

    # One pass over the augmented paragraphs is enough no matter how many
    # paragraph augments requested neighbours; previously an identical
    # ParagraphAugment was built and appended once per requesting augment,
    # duplicating the second augmentation's work downstream.
    wants_neighbours = any(
        paragraph_augment.neighbours_before or paragraph_augment.neighbours_after
        for paragraph_augment in item.paragraphs or []
    )
    if wants_neighbours:
        neighbours = []
        for paragraph in augmented.paragraphs.values():
            # Paragraphs that could not be augmented are stored as None in
            # this mapping (the merge code guards for it); skip them here too
            # instead of raising AttributeError on `.related`.
            if paragraph is None or paragraph.related is None:
                continue
            for neighbour_before in paragraph.related.neighbours_before:
                neighbours.append(Paragraph(id=neighbour_before, metadata=None))
            for neighbour_after in paragraph.related.neighbours_after:
                neighbours.append(Paragraph(id=neighbour_after, metadata=None))

        if neighbours:
            augmentations.append(
                ParagraphAugment(
                    given=neighbours,
                    # Fetch only what the merge step consumes: text and position.
                    select=[
                        ParagraphText(),
                        ParagraphPosition(),
                    ],
                )
            )

    return augmentations
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def merge_second_augment(item: AugmentRequest, response: AugmentResponse, augmented: Augmented):
|
|
522
|
+
"""Merge in-place augmented data with an existing augment response."""
|
|
523
|
+
|
|
524
|
+
if any(
|
|
525
|
+
(
|
|
526
|
+
paragraph_augment.neighbours_before or paragraph_augment.neighbours_after
|
|
527
|
+
for paragraph_augment in item.paragraphs or []
|
|
528
|
+
)
|
|
529
|
+
):
|
|
530
|
+
# neighbour paragraphs
|
|
531
|
+
|
|
532
|
+
new_paragraphs = {}
|
|
533
|
+
for paragraph_id_str, augmented_paragraph in response.paragraphs.items():
|
|
534
|
+
before_refs = []
|
|
535
|
+
for before_id_str in augmented_paragraph.neighbours_before or []:
|
|
536
|
+
before_id = ParagraphId.from_string(before_id_str)
|
|
537
|
+
|
|
538
|
+
if before_id not in augmented.paragraphs:
|
|
539
|
+
continue
|
|
540
|
+
neighbour = augmented.paragraphs[before_id]
|
|
541
|
+
|
|
542
|
+
if before_id_str not in response.paragraphs:
|
|
543
|
+
if not neighbour.text and not neighbour.position:
|
|
544
|
+
continue
|
|
545
|
+
# create a new paragraph for the neighbour
|
|
546
|
+
new_paragraphs[before_id_str] = AugmentedParagraph(
|
|
547
|
+
text=neighbour.text, position=neighbour.position
|
|
548
|
+
)
|
|
549
|
+
|
|
550
|
+
else:
|
|
551
|
+
# merge neighbour with existing paragraph
|
|
552
|
+
if not response.paragraphs[before_id_str].text:
|
|
553
|
+
response.paragraphs[before_id_str].text = neighbour.text
|
|
554
|
+
|
|
555
|
+
before_refs.append(before_id_str)
|
|
556
|
+
|
|
557
|
+
after_refs = []
|
|
558
|
+
for after_id_str in augmented_paragraph.neighbours_after or []:
|
|
559
|
+
after_id = ParagraphId.from_string(after_id_str)
|
|
560
|
+
|
|
561
|
+
if after_id not in augmented.paragraphs:
|
|
562
|
+
continue
|
|
563
|
+
neighbour = augmented.paragraphs[after_id]
|
|
564
|
+
|
|
565
|
+
if after_id_str not in response.paragraphs:
|
|
566
|
+
if not neighbour.text and not neighbour.position:
|
|
567
|
+
continue
|
|
568
|
+
# create a new paragraph for the neighbour
|
|
569
|
+
new_paragraphs[after_id_str] = AugmentedParagraph(
|
|
570
|
+
text=neighbour.text, position=neighbour.position
|
|
571
|
+
)
|
|
572
|
+
|
|
573
|
+
else:
|
|
574
|
+
# merge neighbour with existing paragraph
|
|
575
|
+
if not response.paragraphs[after_id_str].text:
|
|
576
|
+
response.paragraphs[after_id_str].text = neighbour.text
|
|
577
|
+
|
|
578
|
+
after_refs.append(after_id_str)
|
|
579
|
+
|
|
580
|
+
# update references to contain only the neighbours that existed in
|
|
581
|
+
# the response or we added
|
|
582
|
+
augmented_paragraph.neighbours_before = before_refs
|
|
583
|
+
augmented_paragraph.neighbours_after = after_refs
|
|
584
|
+
|
|
585
|
+
response.paragraphs.update(new_paragraphs)
|