nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from pydantic import BaseModel
|
|
21
|
+
|
|
22
|
+
from nucliadb_models.common import FieldTypeName
|
|
23
|
+
from nucliadb_models.resource import ExtractedDataTypeName
|
|
24
|
+
from nucliadb_models.search import ResourceProperties
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ResourceHydrationOptions(BaseModel):
    """
    Options controlling how resources are hydrated.

    Every option defaults to an empty list.
    """

    # NOTE: the bare `[]` defaults are fine on a pydantic model, which copies
    # mutable defaults per instance instead of sharing one list.

    # resource properties selected for hydration
    show: list[ResourceProperties] = []
    # extracted data types selected for hydration
    extracted: list[ExtractedDataTypeName] = []
    # NOTE(review): presumably restricts hydration to these field types;
    # confirm against the code consuming these options
    field_type_filter: list[FieldTypeName] = []
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TextBlockHydrationOptions(BaseModel):
    """
    Options controlling how text blocks (a.k.a. paragraphs) are hydrated.
    """

    # When true, the text block is highlighted using `<mark>...</mark>` tags
    highlight: bool = False

    # Exact matches that should be highlighted (defaults to None)
    ematches: list[str] | None = None

    # When true, a text block is only hydrated if its text has not been
    # populated yet
    only_hydrate_empty: bool = False
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from typing import cast
|
|
21
|
+
|
|
22
|
+
from typing_extensions import assert_never
|
|
23
|
+
|
|
24
|
+
from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId
|
|
25
|
+
from nucliadb.ingest.fields.base import Field
|
|
26
|
+
from nucliadb.ingest.fields.conversation import Conversation
|
|
27
|
+
from nucliadb.ingest.fields.file import File
|
|
28
|
+
from nucliadb.ingest.fields.generic import Generic
|
|
29
|
+
from nucliadb.ingest.fields.link import Link
|
|
30
|
+
from nucliadb.ingest.fields.text import Text
|
|
31
|
+
from nucliadb.models.internal.augment import ConversationProp, FieldProp, FieldText, FieldValue
|
|
32
|
+
from nucliadb.search.augmentor.fields import (
|
|
33
|
+
db_augment_conversation_field,
|
|
34
|
+
db_augment_file_field,
|
|
35
|
+
db_augment_generic_field,
|
|
36
|
+
db_augment_link_field,
|
|
37
|
+
db_augment_text_field,
|
|
38
|
+
)
|
|
39
|
+
from nucliadb_models import hydration as hydration_models
|
|
40
|
+
from nucliadb_models.common import FieldTypeName
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def page_preview_id(page_number: int) -> str:
    """Return the string form of the page number for a specific page."""
    # `format(x)` with an empty spec is exactly what an f-string `{x}` does.
    return format(page_number)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def hydrate_field(
    field: Field, field_id: FieldId, config: hydration_models.FieldHydration
) -> (
    hydration_models.HydratedTextField
    | hydration_models.HydratedFileField
    | hydration_models.HydratedLinkField
    | hydration_models.HydratedConversationField
    | hydration_models.HydratedGenericField
    | None
):
    """Hydrate `field` with the hydrator that matches its field type.

    The field type is resolved from `field_id.type` through
    `FIELD_TYPE_STR_TO_NAME`, and the matching per-type section of `config`
    (`text`, `file`, `link`, `conversation` or `generic`) is handed to the
    corresponding `hydrate_*_field` coroutine.

    Returns:
        The hydrated field produced by the type-specific hydrator, or `None`
        when `config` carries no hydration section for this field type.

    Raises:
        Whatever the `FIELD_TYPE_STR_TO_NAME[...]` lookup raises for an
        unknown `field_id.type` (a `KeyError` if it is a plain dict).
    """
    field_type = FIELD_TYPE_STR_TO_NAME[field_id.type]

    if field_type == FieldTypeName.TEXT:
        # Previously spelled `not config.text is not None`, which parses as
        # `config.text is None`; written directly here.
        # NOTE(review): the other branches test truthiness rather than
        # `is None`; each check is kept as it was to preserve behavior.
        if config.text is None:
            return None
        return await hydrate_text_field(cast(Text, field), field_id, config.text)

    # The branches below used to read `field_type == FieldTypeName.X is not None`,
    # a chained comparison whose `is not None` half is always true. A plain
    # equality check is equivalent and says what is meant.
    elif field_type == FieldTypeName.FILE:
        if not config.file:
            return None
        return await hydrate_file_field(cast(File, field), field_id, config.file)

    elif field_type == FieldTypeName.LINK:
        if not config.link:
            return None
        return await hydrate_link_field(cast(Link, field), field_id, config.link)

    elif field_type == FieldTypeName.CONVERSATION:
        if not config.conversation:
            return None
        return await hydrate_conversation_field(
            cast(Conversation, field), field_id, config.conversation
        )

    elif field_type == FieldTypeName.GENERIC:
        if not config.generic:
            return None
        return await hydrate_generic_field(cast(Generic, field), field_id, config.generic)

    else:  # pragma: no cover
        assert_never(field_type)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
async def hydrate_text_field(
    field: Text,
    field_id: FieldId,
    config: hydration_models.TextFieldHydration,
) -> hydration_models.HydratedTextField:
    """Hydrate a text field according to `config`.

    Only the properties enabled in `config` (`value`, `extracted_text`) are
    requested from `db_augment_text_field`. Of those, only the ones that come
    back non-empty are copied onto the returned `HydratedTextField`.
    """
    # Properties to fetch, in the order value -> extracted text
    props: list[FieldProp] = []
    if config.value:
        props += [FieldValue()]
    if config.extracted_text:
        props += [FieldText()]

    fetched = await db_augment_text_field(field, field_id, props)

    result = hydration_models.HydratedTextField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.TEXT,
    )

    if config.value and (value := fetched.value):
        result.value = value

    if config.extracted_text and (text := fetched.text):
        result.extracted = hydration_models.FieldExtractedData(text=text)

    return result
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
async def hydrate_file_field(
    field: File,
    field_id: FieldId,
    config: hydration_models.FileFieldHydration,
) -> hydration_models.HydratedFileField:
    """Hydrate a file field according to `config`.

    Only the properties enabled in `config` (`value`, `extracted_text`) are
    requested from `db_augment_file_field`. Of those, only the ones that come
    back non-empty are copied onto the returned `HydratedFileField`.
    """
    # Properties to fetch, in the order value -> extracted text
    props: list[FieldProp] = []
    if config.value:
        props += [FieldValue()]
    if config.extracted_text:
        props += [FieldText()]

    fetched = await db_augment_file_field(field, field_id, props)

    result = hydration_models.HydratedFileField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.FILE,
    )

    if config.value and (value := fetched.value):
        result.value = value

    if config.extracted_text and (text := fetched.text):
        result.extracted = hydration_models.FieldExtractedData(text=text)

    return result
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
async def hydrate_link_field(
    field: Link,
    field_id: FieldId,
    config: hydration_models.LinkFieldHydration,
) -> hydration_models.HydratedLinkField:
    """Hydrate a link field according to `config`.

    Only the properties enabled in `config` (`value`, `extracted_text`) are
    requested from `db_augment_link_field`. Of those, only the ones that come
    back non-empty are copied onto the returned `HydratedLinkField`.
    """
    # Properties to fetch, in the order value -> extracted text
    props: list[FieldProp] = []
    if config.value:
        props += [FieldValue()]
    if config.extracted_text:
        props += [FieldText()]

    fetched = await db_augment_link_field(field, field_id, props)

    result = hydration_models.HydratedLinkField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.LINK,
    )

    if config.value and (value := fetched.value):
        result.value = value

    if config.extracted_text and (text := fetched.text):
        result.extracted = hydration_models.FieldExtractedData(text=text)

    return result
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
async def hydrate_conversation_field(
    field: Conversation,
    field_id: FieldId,
    config: hydration_models.ConversationFieldHydration,
) -> hydration_models.HydratedConversationField:
    """Hydrate a conversation field according to `config`.

    The field value is requested from `db_augment_conversation_field` only
    when `config.value` is enabled. It is copied onto the returned
    `HydratedConversationField` only if it comes back non-empty.
    """
    props: list[ConversationProp] = []
    if config.value:
        props += [FieldValue()]

    fetched = await db_augment_conversation_field(field, field_id, props)

    result = hydration_models.HydratedConversationField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.CONVERSATION,
    )

    if config.value and (value := fetched.value):
        result.value = value

    return result
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
async def hydrate_generic_field(
    field: Generic,
    field_id: FieldId,
    config: hydration_models.GenericFieldHydration,
) -> hydration_models.HydratedGenericField:
    """Hydrate a generic field according to `config`.

    Only the properties enabled in `config` (`value`, `extracted_text`) are
    requested from `db_augment_generic_field`. Of those, only the ones that
    come back non-empty are copied onto the returned `HydratedGenericField`.
    """
    # Properties to fetch, in the order value -> extracted text
    props: list[FieldProp] = []
    if config.value:
        props += [FieldValue()]
    if config.extracted_text:
        props += [FieldText()]

    fetched = await db_augment_generic_field(field, field_id, props)

    result = hydration_models.HydratedGenericField(
        id=field_id.full(),
        resource=field_id.rid,
        field_type=FieldTypeName.GENERIC,
    )

    if config.value and (value := fetched.value):
        result.value = value

    if config.extracted_text and (text := fetched.text):
        result.extracted = hydration_models.FieldExtractedData(text=text)

    return result
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
import base64
|
|
21
|
+
from typing import cast
|
|
22
|
+
|
|
23
|
+
from typing_extensions import assert_never
|
|
24
|
+
|
|
25
|
+
from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId, ParagraphId
|
|
26
|
+
from nucliadb.ingest.fields.base import Field
|
|
27
|
+
from nucliadb.ingest.fields.file import File
|
|
28
|
+
from nucliadb.search import SERVICE_NAME
|
|
29
|
+
from nucliadb_models.common import FieldTypeName
|
|
30
|
+
from nucliadb_models.search import Image
|
|
31
|
+
from nucliadb_protos import resources_pb2
|
|
32
|
+
from nucliadb_utils.utilities import get_storage
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def paragraph_source_image(
    kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
) -> Image | None:
    """Return the original image a paragraph was extracted from, if any.

    Some paragraphs are extracted from images (through OCR or inception).
    For those, the image referenced by the paragraph representation is
    downloaded and returned. `None` is returned when the paragraph has no
    reference file, when its kind is neither OCR nor INCEPTION, or when the
    download yields nothing.
    """
    source_image = paragraph.representation.reference_file
    image_kinds = (
        resources_pb2.Paragraph.TypeParagraph.OCR,
        resources_pb2.Paragraph.TypeParagraph.INCEPTION,
    )
    # The reference file is checked first, the paragraph kind second
    if not source_image or paragraph.kind not in image_kinds:
        return None

    # Paragraphs extracted from an image keep their original image
    # representation in the reference file. The stored path is incomplete
    # though: the file actually lives under the `generated` folder.
    return await download_image(
        kbid,
        paragraph_id.field_id,
        f"generated/{source_image}",
        # XXX: every reference file is assumed to be a PNG image, but that
        # really depends on learning, so the assumption is dangerous. We
        # should check it ourselves.
        mime_type="image/png",
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
async def download_image(
    kbid: str, field_id: FieldId, image_path: str, *, mime_type: str
) -> Image | None:
    """Download an extracted file of a field and return it as an `Image`.

    The file is located with `storage.file_extracted(...)` from the KB id,
    the parts of `field_id` (rid, type, key) and `image_path`. Its bytes are
    returned base64-encoded and tagged with `mime_type`. `None` is returned
    when the download produces no bytes.
    """
    storage = await get_storage(service_name=SERVICE_NAME)
    extracted_file = storage.file_extracted(
        kbid, field_id.rid, field_id.type, field_id.key, image_path
    )

    buffer = await storage.downloadbytes(extracted_file.bucket, extracted_file.key)
    payload = buffer.getvalue()
    if payload:
        encoded = base64.b64encode(payload).decode()
        return Image(content_type=mime_type, b64encoded=encoded)
    return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
async def download_page_preview(field: Field, page: int) -> Image | None:
    """Download the preview of a specific page of a field as an `Image`.

    Not every field has previews, so this function may return None.

    Page previews are uploaded by learning and shared with nucliadb through
    a known path.
    """
    field_type = FIELD_TYPE_STR_TO_NAME[field.type]

    if field_type == FieldTypeName.FILE:
        file_field = cast(File, field)
        metadata = await file_field.get_file_extracted_data()
        if metadata is None:
            return None

        assert page <= len(metadata.file_pages_previews.positions), (
            f"paragraph page number {page} should be less or equal to the total file pages previews {len(metadata.file_pages_previews.positions)}"
        )
        return await download_image(
            file_field.kbid,
            file_field.field_id,
            f"generated/extracted_images_{page}.png",
            mime_type="image/png",
        )

    if field_type == FieldTypeName.LINK:
        # TODO: for links we want to return the link preview, i.e. the link
        # converted to PDF and screenshotted
        # REVIEW: is the link preview an image or a PDF?
        return None

    if (
        field_type == FieldTypeName.TEXT
        or field_type == FieldTypeName.CONVERSATION
        or field_type == FieldTypeName.GENERIC
    ):
        # these field types have no previews
        return None

    assert_never(field_type)  # pragma: no cover
|