nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/augmentor/metrics.py
ADDED

@@ -0,0 +1,24 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+from nucliadb_telemetry.metrics import Observer
+
+augmentor_observer = Observer("augmentor", labels={"type": ""})
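The `Observer` declared here is what the rest of the new augmentor package uses to time its work; later hunks in this diff wrap functions with labels like `{"type": "paragraph_text"}` and `{"type": "db_resource"}`. A minimal sketch of that pattern (the function and label below are made up for illustration):

from nucliadb.search.augmentor.metrics import augmentor_observer

# Timing a coroutine under a given label, mirroring the decorators used in
# paragraphs.py and resources.py below. "example_op" is a hypothetical label.
@augmentor_observer.wrap({"type": "example_op"})
async def example_op() -> None:
    ...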
nucliadb/search/augmentor/paragraphs.py
ADDED

@@ -0,0 +1,334 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import asyncio
+from collections.abc import Sequence
+from typing import cast
+
+from typing_extensions import assert_never
+
+from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.models.internal.augment import (
+    AugmentedParagraph,
+    AugmentedRelatedParagraphs,
+    Metadata,
+    Paragraph,
+    ParagraphImage,
+    ParagraphPage,
+    ParagraphPosition,
+    ParagraphProp,
+    ParagraphTable,
+    ParagraphText,
+    RelatedParagraphs,
+)
+from nucliadb.search.augmentor.metrics import augmentor_observer
+from nucliadb.search.augmentor.utils import limited_concurrency
+from nucliadb.search.search import cache
+from nucliadb.search.search.paragraphs import get_paragraph_from_full_text
+from nucliadb_models.search import TextPosition
+from nucliadb_protos import resources_pb2
+
+
+async def augment_paragraphs(
+    kbid: str,
+    given: list[Paragraph],
+    select: list[ParagraphProp],
+    *,
+    concurrency_control: asyncio.Semaphore | None = None,
+) -> dict[ParagraphId, AugmentedParagraph | None]:
+    """Augment a list of paragraphs following an augmentation"""
+
+    ops = []
+    for paragraph in given:
+        task = asyncio.create_task(
+            limited_concurrency(
+                augment_paragraph(kbid, paragraph.id, select, paragraph.metadata),
+                max_ops=concurrency_control,
+            )
+        )
+        ops.append(task)
+    results: list[AugmentedParagraph | None] = await asyncio.gather(*ops)
+
+    augmented = {}
+    for paragraph, augmentation in zip(given, results):
+        augmented[paragraph.id] = augmentation
+
+    return augmented
+
+
+async def augment_paragraph(
+    kbid: str,
+    paragraph_id: ParagraphId,
+    select: list[ParagraphProp],
+    metadata: Metadata | None,
+) -> AugmentedParagraph | None:
+    rid = paragraph_id.rid
+    resource = await cache.get_resource(kbid, rid)
+    if resource is None:
+        # skip resources that aren't in the DB
+        return None
+
+    field_id = paragraph_id.field_id
+    field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
+    # we must check if field exists or get_field will return an empty field
+    # (behaviour thought for ingestion) that we don't want
+    if not (await resource.field_exists(field_type_pb, field_id.key)):
+        # skip a fields that aren't in the DB
+        return None
+    field = await resource.get_field(field_id.key, field_id.pb_type)
+
+    return await db_augment_paragraph(resource, field, paragraph_id, select, metadata)
+
+
+async def db_augment_paragraph(
+    resource: Resource,
+    field: Field,
+    paragraph_id: ParagraphId,
+    select: list[ParagraphProp],
+    metadata: Metadata | None,
+) -> AugmentedParagraph:
+    select = dedup_paragraph_select(select)
+
+    # we use an accessor to get the metadata to avoid unnecessary DB round
+    # trips. With this, we'll only fetch it one and only if we need it
+    _metadata = metadata
+    _metadata_available = True
+
+    async def access_metadata() -> Metadata | None:
+        nonlocal _metadata, _metadata_available
+
+        if _metadata is None and _metadata_available:
+            _metadata = await db_paragraph_metadata(field, paragraph_id)
+
+            if _metadata is None:
+                _metadata_available = False
+
+        return _metadata
+
+    text = None
+    position = None
+    image_path = None
+    table_path = None
+    page_preview_path = None
+    related = None
+    for prop in select:
+        if isinstance(prop, ParagraphText):
+            text = await get_paragraph_text(field, paragraph_id)
+
+        elif isinstance(prop, ParagraphPosition):
+            position = await get_paragraph_position(field, paragraph_id)
+
+        elif isinstance(prop, ParagraphImage):
+            metadata = await access_metadata()
+            if metadata is None:
+                continue
+            if metadata.is_an_image and metadata.source_file:
+                image_path = f"generated/{metadata.source_file}"
+
+        elif isinstance(prop, ParagraphTable):
+            metadata = await access_metadata()
+            if metadata is None:
+                continue
+            if metadata.is_a_table:
+                if prop.prefer_page_preview and metadata.page and metadata.in_page_with_visual:
+                    page_preview_path = f"generated/extracted_images_{metadata.page}.png"
+                    table_path = page_preview_path
+                elif metadata.source_file:
+                    image_path = f"generated/{metadata.source_file}"
+                    table_path = image_path
+
+        elif isinstance(prop, ParagraphPage):
+            if prop.preview:
+                metadata = await access_metadata()
+                if metadata is None:
+                    continue
+                if metadata.page and metadata.in_page_with_visual:
+                    page_preview_path = f"generated/extracted_images_{metadata.page}.png"
+
+        elif isinstance(prop, RelatedParagraphs):
+            related = await related_paragraphs(
+                field,
+                paragraph_id,
+                neighbours_before=prop.neighbours_before,
+                neighbours_after=prop.neighbours_after,
+            )
+
+        else:  # pragma: no cover
+            assert_never(prop)
+
+    return AugmentedParagraph(
+        id=paragraph_id,
+        text=text,
+        position=position,
+        source_image_path=image_path,
+        table_image_path=table_path,
+        page_preview_path=page_preview_path,
+        related=related,
+    )
+
+
+def dedup_paragraph_select(select: list[ParagraphProp]) -> list[ParagraphProp]:
+    """Merge any duplicated property taking the broader augmentation possible."""
+    merged = {}
+    for prop in select:
+        if prop.prop not in merged:
+            merged[prop.prop] = prop
+
+        else:
+            m = merged[prop.prop]
+
+            if (
+                isinstance(prop, ParagraphText)
+                or isinstance(prop, ParagraphPosition)
+                or isinstance(prop, ParagraphImage)
+            ):
+                # properties without parameters
+                pass
+
+            elif isinstance(prop, ParagraphTable):
+                prop = cast(ParagraphTable, prop)
+                m = cast(ParagraphTable, m)
+                m.prefer_page_preview = m.prefer_page_preview or prop.prefer_page_preview
+
+            elif isinstance(prop, ParagraphPage):
+                prop = cast(ParagraphPage, prop)
+                m = cast(ParagraphPage, m)
+                m.preview = m.preview or prop.preview
+
+            elif isinstance(prop, RelatedParagraphs):
+                prop = cast(RelatedParagraphs, prop)
+                m = cast(RelatedParagraphs, m)
+                m.neighbours_before = max(m.neighbours_before, prop.neighbours_before)
+                m.neighbours_after = max(m.neighbours_after, prop.neighbours_after)
+
+            else:  # pragma: no cover
+                assert_never(prop)
+
+    return list(merged.values())
+
+
+async def db_paragraph_metadata(field: Field, paragraph_id: ParagraphId) -> Metadata | None:
+    """Obtain paragraph metadata from the source of truth (maindb/blob).
+
+    This operation may require data from blob storage, which makes it costly.
+
+    """
+    field_paragraphs = await get_field_paragraphs(field)
+    if field_paragraphs is None:
+        # We don't have paragraph metadata for this field, we can't do anything
+        return None
+
+    for paragraph in field_paragraphs:
+        field_paragraph_id = field.field_id.paragraph_id(paragraph.start, paragraph.end)
+        if field_paragraph_id == paragraph_id:
+            metadata = Metadata.from_db_paragraph(paragraph)
+            return metadata
+    else:
+        return None
+
+
+async def get_field_paragraphs(field: Field) -> Sequence[resources_pb2.Paragraph] | None:
+    field_metadata = await field.get_field_metadata()
+    if field_metadata is None:
+        return None
+
+    field_id = field.field_id
+    if field_id.subfield_id is None:
+        field_paragraphs = field_metadata.metadata.paragraphs
+    else:
+        field_paragraphs = field_metadata.split_metadata[field_id.subfield_id].paragraphs
+
+    return field_paragraphs
+
+
+@augmentor_observer.wrap({"type": "paragraph_text"})
+async def get_paragraph_text(field: Field, paragraph_id: ParagraphId) -> str | None:
+    text = await get_paragraph_from_full_text(
+        field=field,
+        start=paragraph_id.paragraph_start,
+        end=paragraph_id.paragraph_end,
+        split=paragraph_id.field_id.subfield_id,
+        log_on_missing_field=True,
+    )
+    # we want to be explicit with not having the paragraph text but the function
+    # above returns an empty string if it can't find it
+    return text or None
+
+
+async def get_paragraph_position(field: Field, paragraph_id: ParagraphId) -> TextPosition | None:
+    field_paragraphs = await get_field_paragraphs(field)
+    if field_paragraphs is None:
+        return None
+
+    idx: int | None
+    for idx, paragraph in enumerate(field_paragraphs):
+        field_paragraph_id = field.field_id.paragraph_id(paragraph.start, paragraph.end)
+        if field_paragraph_id == paragraph_id:
+            break
+    else:
+        # we haven't found the paragraph, we can't provide a position
+        return None
+
+    return TextPosition(
+        index=idx,
+        start=paragraph.start,
+        end=paragraph.end,
+        start_seconds=list(paragraph.start_seconds),
+        end_seconds=list(paragraph.end_seconds),
+    )
+
+
+async def related_paragraphs(
+    field: Field,
+    paragraph_id: ParagraphId,
+    *,
+    neighbours_before: int = 0,
+    neighbours_after: int = 0,
+) -> AugmentedRelatedParagraphs | None:
+    field_paragraphs = await get_field_paragraphs(field)
+    if field_paragraphs is None:
+        return None
+
+    idx: int | None
+    for idx, paragraph in enumerate(field_paragraphs):
+        field_paragraph_id = field.field_id.paragraph_id(paragraph.start, paragraph.end)
+        if field_paragraph_id == paragraph_id:
+            break
+    else:
+        # we haven't found the paragraph, we won't find any related either
+        return None
+
+    before = []
+    for idx_before in range(max(idx - neighbours_before, 0), idx):
+        paragraph = field_paragraphs[idx_before]
+        paragraph_id = field.field_id.paragraph_id(paragraph.start, paragraph.end)
+        before.append(paragraph_id)
+
+    after = []
+    for idx_after in range(idx + 1, min(idx + 1 + neighbours_after, len(field_paragraphs))):
+        paragraph = field_paragraphs[idx_after]
+        paragraph_id = field.field_id.paragraph_id(paragraph.start, paragraph.end)
+        after.append(paragraph_id)
+
+    return AugmentedRelatedParagraphs(
+        neighbours_before=before,
+        neighbours_after=after,
+    )
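A hedged sketch of how the `augment_paragraphs` entry point above might be driven. The `Paragraph` and prop models come from `nucliadb/models/internal/augment.py` (also added in this release but not expanded in this diff view), so the constructors and the paragraph id string shown here are assumptions for illustration only:

import asyncio

from nucliadb.common.ids import ParagraphId
from nucliadb.models.internal.augment import Paragraph, ParagraphText, RelatedParagraphs
from nucliadb.search.augmentor.paragraphs import augment_paragraphs

async def main() -> None:
    # Assumed shapes: ParagraphId.from_string() and Paragraph(id=..., metadata=None)
    # are not shown in this diff, treat them as illustrative.
    pid = ParagraphId.from_string("rid/f/myfield/0-120")
    paragraphs = [Paragraph(id=pid, metadata=None)]
    select = [ParagraphText(), RelatedParagraphs(neighbours_before=1, neighbours_after=1)]

    augmented = await augment_paragraphs(
        "my-kbid",
        paragraphs,
        select,
        concurrency_control=asyncio.Semaphore(10),  # cap concurrent DB/blob fetches
    )
    for paragraph_id, item in augmented.items():
        if item is not None:
            print(paragraph_id, item.text)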
nucliadb/search/augmentor/resources.py
ADDED

@@ -0,0 +1,238 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import asyncio
+
+from typing_extensions import assert_never
+
+import nucliadb_models.resource
+from nucliadb.common import datamanagers
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.ingest.serialize import (
+    serialize_extra,
+    serialize_origin,
+    serialize_resource,
+    serialize_security,
+)
+from nucliadb.models.internal.augment import (
+    AugmentedResource,
+    ResourceClassificationLabels,
+    ResourceExtra,
+    ResourceOrigin,
+    ResourceProp,
+    ResourceSecurity,
+    ResourceSummary,
+    ResourceTitle,
+)
+from nucliadb.search.augmentor.metrics import augmentor_observer
+from nucliadb.search.augmentor.utils import limited_concurrency
+from nucliadb.search.search import cache
+from nucliadb.search.search.hydrator import ResourceHydrationOptions
+from nucliadb_models.search import ResourceProperties
+from nucliadb_protos import resources_pb2
+from nucliadb_utils import const
+from nucliadb_utils.utilities import has_feature
+
+
+async def augment_resources(
+    kbid: str,
+    given: list[str],
+    select: list[ResourceProp],
+    *,
+    concurrency_control: asyncio.Semaphore | None = None,
+) -> dict[str, AugmentedResource | None]:
+    """Augment a list of resources following an augmentation"""
+
+    ops = []
+    for rid in given:
+        task = asyncio.create_task(
+            limited_concurrency(
+                augment_resource(kbid, rid, select),
+                max_ops=concurrency_control,
+            )
+        )
+        ops.append(task)
+    results: list[AugmentedResource | None] = await asyncio.gather(*ops)
+
+    augmented = {}
+    for rid, augmentation in zip(given, results):
+        augmented[rid] = augmentation
+
+    return augmented
+
+
+async def augment_resource(
+    kbid: str,
+    rid: str,
+    select: list[ResourceProp],
+) -> AugmentedResource | None:
+    resource = await cache.get_resource(kbid, rid)
+    if resource is None:
+        # skip resources that aren't in the DB
+        return None
+
+    return await db_augment_resource(resource, select)
+
+
+@augmentor_observer.wrap({"type": "db_resource"})
+async def db_augment_resource(
+    resource: Resource,
+    select: list[ResourceProp],
+) -> AugmentedResource:
+    select = dedup_resource_select(select)
+
+    title = None
+    summary = None
+    origin = None
+    extra = None
+    security = None
+    labels = None
+
+    basic = None
+    for prop in select:
+        if isinstance(prop, ResourceTitle):
+            if basic is None:
+                basic = await resource.get_basic()
+            if basic is not None:
+                title = basic.title
+
+        elif isinstance(prop, ResourceSummary):
+            if basic is None:
+                basic = await resource.get_basic()
+            if basic is not None:
+                summary = basic.summary
+
+        elif isinstance(prop, ResourceOrigin):
+            origin = await serialize_origin(resource)
+
+        elif isinstance(prop, ResourceExtra):
+            extra = await serialize_extra(resource)
+
+        elif isinstance(prop, ResourceSecurity):
+            security = await serialize_security(resource)
+
+        elif isinstance(prop, ResourceClassificationLabels):
+            labels = await classification_labels(resource)
+
+        else:
+            assert_never(prop)
+
+    augmented = AugmentedResource(
+        id=resource.uuid,
+        title=title,
+        summary=summary,
+        origin=origin,
+        extra=extra,
+        security=security,
+        classification_labels=labels,
+    )
+    return augmented
+
+
+def dedup_resource_select(select: list[ResourceProp]) -> list[ResourceProp]:
+    # there's no resource prop with fields that need special treatement to
+    # merge, just get by unique prop id
+    merged: dict[str, ResourceProp] = {}
+    for prop in select:
+        merged.setdefault(prop.prop, prop)
+    return list(merged.values())
+
+
+async def get_basic(resource: Resource) -> resources_pb2.Basic | None:
+    # HACK: resource.get_basic() always returns a pb, even if it's not in the
+    # DB. Here we really want to know if there's basic or not
+    basic = await datamanagers.resources.get_basic(resource.txn, kbid=resource.kbid, rid=resource.uuid)
+    return basic
+
+
+async def classification_labels(resource: Resource) -> dict[str, set[str]] | None:
+    basic = await get_basic(resource)
+    if basic is None:
+        return None
+
+    labels: dict[str, set[str]] = {}
+    for classification in basic.usermetadata.classifications:
+        labels.setdefault(classification.labelset, set()).add(classification.label)
+    return labels
+
+
+async def augment_resources_deep(
+    kbid: str,
+    given: list[str],
+    opts: ResourceHydrationOptions,
+    *,
+    concurrency_control: asyncio.Semaphore | None = None,
+) -> dict[str, nucliadb_models.resource.Resource | None]:
+    """Augment resources using the Resource model. Depending on the options,
+    this can serialize resource fields, extracted data like text, vectors...
+
+    Thus, this operation can be quite expensive.
+
+    """
+
+    if ResourceProperties.EXTRACTED in opts.show and has_feature(
+        const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
+    ):
+        # Returning extracted metadata in search results is deprecated and this flag
+        # will be set to True for all KBs in the future.
+        opts.show.remove(ResourceProperties.EXTRACTED)
+        opts.extracted.clear()
+
+    ops = []
+    for rid in given:
+        task = asyncio.create_task(
+            limited_concurrency(
+                augment_resource_deep(kbid, rid, opts),
+                max_ops=concurrency_control,
+            )
+        )
+        ops.append(task)
+    results: list[nucliadb_models.resource.Resource | None] = await asyncio.gather(*ops)
+
+    augmented: dict[str, nucliadb_models.resource.Resource | None] = {}
+    for rid, augmentation in zip(given, results):
+        augmented[rid] = augmentation
+
+    return augmented
+
+
+@augmentor_observer.wrap({"type": "seialize_resource"})
+async def augment_resource_deep(
+    kbid: str,
+    rid: str,
+    opts: ResourceHydrationOptions,
+) -> nucliadb_models.resource.Resource | None:
+    """Augment a resource using the Resource model. Depending on the options,
+    this can serialize resource fields, extracted data like text, vectors...
+
+    Thus, this operation can be quite expensive.
+
+    """
+    resource = await cache.get_resource(kbid, rid)
+    if resource is None:
+        # skip resources that aren't in the DB
+        return None
+
+    serialized = await serialize_resource(
+        resource,
+        show=opts.show,
+        field_type_filter=opts.field_type_filter,
+        extracted=opts.extracted,
+    )
+    return serialized
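Similarly, a sketch of the shallow resource augmentation above, assuming the prop models (`ResourceTitle`, `ResourceSummary`, ...) can be instantiated with their defaults; the kbid and resource ids are placeholders:

import asyncio

from nucliadb.models.internal.augment import ResourceSummary, ResourceTitle
from nucliadb.search.augmentor.resources import augment_resources

async def main() -> None:
    augmented = await augment_resources(
        "my-kbid",
        ["resource-uuid-1", "resource-uuid-2"],
        [ResourceTitle(), ResourceSummary()],
        concurrency_control=asyncio.Semaphore(10),
    )
    for rid, item in augmented.items():
        # entries are None for resources no longer present in maindb
        print(rid, None if item is None else item.title)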
nucliadb/search/augmentor/utils.py
ADDED

@@ -0,0 +1,33 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import asyncio
+from collections.abc import Awaitable
+from contextlib import AsyncExitStack
+from typing import TypeVar
+
+T = TypeVar("T")
+
+
+async def limited_concurrency(aw: Awaitable[T], *, max_ops: asyncio.Semaphore | None) -> T:
+    async with AsyncExitStack() as stack:
+        if max_ops is not None:
+            await stack.enter_async_context(max_ops)
+        r = await aw
+    return r
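The helper is independent of the augmentor models, so it is easy to exercise on its own. A small self-contained example (the `fetch` coroutine and the semaphore size are made up; passing `max_ops=None` would run everything unbounded):

import asyncio

from nucliadb.search.augmentor.utils import limited_concurrency

async def fetch(i: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for a DB or blob-storage round trip
    return i

async def main() -> None:
    sem = asyncio.Semaphore(4)  # at most 4 wrapped coroutines run at once
    tasks = [
        asyncio.create_task(limited_concurrency(fetch(i), max_ops=sem)) for i in range(20)
    ]
    print(await asyncio.gather(*tasks))

asyncio.run(main())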
nucliadb/search/lifecycle.py
CHANGED
@@ -24,7 +24,7 @@ from fastapi import FastAPI
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.context.fastapi import inject_app_context
 from nucliadb.common.maindb.utils import setup_driver
-from nucliadb.common.nidx import start_nidx_utility
+from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
 from nucliadb.ingest.utils import start_ingest, stop_ingest
 from nucliadb.search import SERVICE_NAME
 from nucliadb.search.predict import start_predict_engine
@@ -61,6 +61,8 @@ async def lifespan(app: FastAPI):
     if get_utility(Utility.PREDICT):
         clean_utility(Utility.PREDICT)
 
+    await stop_nidx_utility()
+
     await finalize_utilities()
     await stop_audit_utility()
     await teardown_cluster()