nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
import asyncio
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
|
|
23
|
+
from nucliadb.common.ids import FieldId, ParagraphId
|
|
24
|
+
from nucliadb.ingest.fields.base import Field
|
|
25
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
26
|
+
from nucliadb.search.augmentor.paragraphs import get_paragraph_text
|
|
27
|
+
from nucliadb.search.search.hydrator.fields import page_preview_id
|
|
28
|
+
from nucliadb.search.search.hydrator.images import paragraph_source_image
|
|
29
|
+
from nucliadb_models import hydration as hydration_models
|
|
30
|
+
from nucliadb_protos import resources_pb2
|
|
31
|
+
from nucliadb_protos.resources_pb2 import FieldComputedMetadata
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ParagraphIndex:
    """Cache the paragraphs of a field, and the relations between them, so they
    can be looked up by paragraph id.

    The index is filled lazily by `build()`. Until then (or when the field has
    no metadata) every lookup simply finds nothing.

    NOTE(review): `_build` keys its dictionaries with `ParagraphId.full()`
    while the lookup methods normalise their argument with `str()`. This
    assumes `str(ParagraphId)` equals `ParagraphId.full()` -- confirm.
    """

    NEXT = "next"
    PREVIOUS = "previous"
    PARENTS = "parents"
    SIBLINGS = "siblings"
    REPLACEMENTS = "replacements"

    def __init__(self, field_id: FieldId) -> None:
        self.field_id = field_id
        # paragraph id -> paragraph metadata
        self.paragraphs: dict[str, resources_pb2.Paragraph] = {}
        # (paragraph id, NEXT | PREVIOUS) -> adjacent paragraph id
        self.neighbours: dict[tuple[str, str], str] = {}
        # (paragraph id, PARENTS | SIBLINGS | REPLACEMENTS) -> related paragraph ids
        self.related: dict[tuple[str, str], list[str]] = {}
        self._lock = asyncio.Lock()
        self._built = False

    async def build(self, field: Field) -> None:
        """Build the index if it hasn't been built yet.

        Concurrent tasks may all ask for a build: the flag is checked, the lock
        is taken and the flag is checked again, so the work is done only once.
        """
        if self._built:
            return

        async with self._lock:
            # another task may have built the index while we waited for the lock
            if self._built:
                return

            field_metadata = await field.get_field_metadata()

            if field_metadata is None:
                # Field metadata may still be processing. To give a consistent
                # view (even if the metadata shows up while we hydrate) we
                # behave as if it does not exist: the index is marked as built
                # and no paragraph will be found for this field.
                self._built = True
                return

            # REVIEW: this is CPU-bound code, we may consider running it in an
            # executor to not block the loop
            self._build(field_metadata)
            self._built = True

    def _build(self, field_metadata: FieldComputedMetadata) -> None:
        """Rebuild every lookup table from scratch out of `field_metadata`."""
        self.paragraphs.clear()
        self.neighbours.clear()
        self.related.clear()

        if self.field_id.subfield_id is None:
            field_paragraphs = field_metadata.metadata.paragraphs
        else:
            field_paragraphs = field_metadata.split_metadata[self.field_id.subfield_id].paragraphs

        previous = None
        for paragraph in field_paragraphs:
            paragraph_id = self.field_id.paragraph_id(paragraph.start, paragraph.end).full()
            self.paragraphs[paragraph_id] = paragraph

            # link each paragraph with the one that precedes it in the field
            if previous is not None:
                self.neighbours[(previous, ParagraphIndex.NEXT)] = paragraph_id
                self.neighbours[(paragraph_id, ParagraphIndex.PREVIOUS)] = previous
            previous = paragraph_id

            # copy the relations into plain lists
            relations = paragraph.relations
            self.related[(paragraph_id, ParagraphIndex.PARENTS)] = list(relations.parents)
            self.related[(paragraph_id, ParagraphIndex.SIBLINGS)] = list(relations.siblings)
            self.related[(paragraph_id, ParagraphIndex.REPLACEMENTS)] = list(relations.replacements)

    def get(self, paragraph_id: str | ParagraphId) -> resources_pb2.Paragraph | None:
        """Return the indexed paragraph for `paragraph_id`, or None if unknown."""
        paragraph_id = str(paragraph_id)
        return self.paragraphs.get(paragraph_id)

    def previous(self, paragraph_id: str | ParagraphId) -> str | None:
        """Return the id of the paragraph right before `paragraph_id`, if any."""
        paragraph_id = str(paragraph_id)
        return self.neighbours.get((paragraph_id, ParagraphIndex.PREVIOUS))

    def next(self, paragraph_id: str | ParagraphId) -> str | None:
        """Return the id of the paragraph right after `paragraph_id`, if any."""
        paragraph_id = str(paragraph_id)
        return self.neighbours.get((paragraph_id, ParagraphIndex.NEXT))

    def n_previous(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
        """Return up to `count` ids preceding `paragraph_id`.

        The result is ordered as the paragraphs were indexed, i.e. the
        paragraph closest to `paragraph_id` comes last. Fewer than `count` ids
        are returned when the first paragraph is reached.
        """
        assert count >= 1, f"can't find negative previous {count}"
        paragraph_id = str(paragraph_id)
        previous: list[str] = []
        current_id = paragraph_id
        for _ in range(count):
            previous_id = self.previous(current_id)
            if previous_id is None:
                # we've reached the first paragraph
                break
            previous.append(previous_id)
            current_id = previous_id
        # collected walking backwards (closest first): flip once instead of
        # paying for an insert at the front on every step
        previous.reverse()
        return previous

    def n_next(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
        """Return up to `count` ids following `paragraph_id`, closest first.

        Fewer than `count` ids are returned when the last paragraph is reached.
        """
        assert count >= 1, f"can't find negative nexts {count}"
        paragraph_id = str(paragraph_id)
        nexts: list[str] = []
        current_id = paragraph_id
        for _ in range(count):
            next_id = self.next(current_id)
            if next_id is None:
                # we've reached the last paragraph
                break
            current_id = next_id
            nexts.append(next_id)
        return nexts

    def parents(self, paragraph_id: str | ParagraphId) -> list[str]:
        """Return the parent ids indexed for `paragraph_id` (empty if unknown)."""
        paragraph_id = str(paragraph_id)
        return self.related.get((paragraph_id, ParagraphIndex.PARENTS), [])

    def siblings(self, paragraph_id: str | ParagraphId) -> list[str]:
        """Return the sibling ids indexed for `paragraph_id` (empty if unknown)."""
        paragraph_id = str(paragraph_id)
        return self.related.get((paragraph_id, ParagraphIndex.SIBLINGS), [])

    def replacements(self, paragraph_id: str | ParagraphId) -> list[str]:
        """Return the replacement ids indexed for `paragraph_id` (empty if unknown)."""
        paragraph_id = str(paragraph_id)
        return self.related.get((paragraph_id, ParagraphIndex.REPLACEMENTS), [])
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@dataclass
class ExtraParagraphHydration:
    """Extra information collected while hydrating one paragraph.

    `hydrate_paragraph` returns it next to the hydrated paragraph so the
    caller knows which pages and related paragraphs were referenced.
    """

    # page number set when a page preview reference was added, None otherwise
    field_page: int | None
    # page number set when a table page preview reference was added, None otherwise
    field_table_page: int | None
    # unique paragraphs referenced as related to the hydrated one
    related_paragraph_ids: list[ParagraphId]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
async def hydrate_paragraph(
    resource: Resource,
    field: Field,
    paragraph_id: ParagraphId,
    config: hydration_models.ParagraphHydration,
    field_paragraphs_index: ParagraphIndex,
) -> tuple[hydration_models.HydratedParagraph, ExtraParagraphHydration]:
    """Hydrate a paragraph and return, next to it, the extra hydration needed
    to build a coherent hydration around this paragraph.

    The resource and field exist, but the paragraph is not required to be a
    real one from the paragraph metadata: it can be made-up to cover more or
    less text than what was originally extracted. For such a paragraph only
    the text can be hydrated.
    """
    kbid = resource.kbid

    result = hydration_models.HydratedParagraph(
        id=paragraph_id.full(),
        field=paragraph_id.field_id.full(),
        resource=paragraph_id.rid,
    )
    extra = ExtraParagraphHydration(field_page=None, field_table_page=None, related_paragraph_ids=[])

    if config.text:
        result.text = await get_paragraph_text(field, paragraph_id)

    # everything below needs the paragraph metadata stored in the index
    if not (config.image or config.table or config.page or config.related):
        return result, extra

    await field_paragraphs_index.build(field)
    paragraph = field_paragraphs_index.get(paragraph_id)
    if paragraph is None:
        # this is a fake paragraph. We can't hydrate anything else here
        return result, extra

    if config.related:
        neighbours = config.related.neighbours
        before = None if neighbours is None else neighbours.before
        after = None if neighbours is None else neighbours.after

        result.related, extra.related_paragraph_ids = await related_paragraphs_refs(
            paragraph_id,
            field_paragraphs_index,
            neighbours_before=before,
            neighbours_after=after,
            parents=config.related.parents or False,
            siblings=config.related.siblings or False,
            replacements=config.related.replacements or False,
        )

    if config.image:
        result.image = hydration_models.HydratedParagraphImage()
        if config.image.source_image:
            result.image.source_image = await paragraph_source_image(kbid, paragraph_id, paragraph)

    if config.page:
        if result.page is None:
            result.page = hydration_models.HydratedParagraphPage()

        if config.page.page_with_visual and paragraph.page.page_with_visual:
            # The paragraph lives on a page with visual content, so the
            # preview of that page is returned as an image.
            page_number = paragraph.page.page
            # TODO: what should I do if I later find there's no page in the DB?
            result.page.page_preview_ref = page_preview_id(page_number)
            extra.field_page = page_number

    if config.table:
        if result.table is None:
            result.table = hydration_models.HydratedParagraphTable()

        if config.table.table_page_preview and paragraph.representation.is_a_table:
            # The paragraph is a table and table hydration is enabled, so we
            # return an image representing it. Ideally this would be the
            # paragraph reference_file, but table screenshots are not always
            # perfect and the page preview is preferred. If table images
            # become good enough, it'd be better to use those.
            page_number = paragraph.page.page
            result.table.page_preview_ref = page_preview_id(page_number)
            extra.field_table_page = page_number

    return result, extra
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
async def related_paragraphs_refs(
    paragraph_id: ParagraphId,
    index: ParagraphIndex,
    *,
    neighbours_before: int | None = None,
    neighbours_after: int | None = None,
    parents: bool = False,
    siblings: bool = False,
    replacements: bool = False,
) -> tuple[hydration_models.RelatedParagraphRefs, list[ParagraphId]]:
    """Compute the references to the paragraphs related to `paragraph_id`.

    Only the requested kinds of relation are filled in. The references are
    returned together with the plain list of unique related paragraphs, to
    make the caller's work easier.
    """
    refs = hydration_models.RelatedParagraphRefs()
    unique_ids = set()

    if neighbours_before or neighbours_after:
        refs.neighbours = hydration_models.RelatedNeighbourParagraphRefs()

        if neighbours_before is not None:
            before_ids = index.n_previous(paragraph_id, neighbours_before) if neighbours_before > 0 else []
            # NOTE(review): `before` ends up reversed with respect to what
            # `n_previous` returns -- confirm this ordering is the intended one.
            refs.neighbours.before = before_ids[::-1]
            unique_ids.update(ParagraphId.from_string(raw_id) for raw_id in before_ids)

        if neighbours_after is not None:
            after_ids = index.n_next(paragraph_id, neighbours_after) if neighbours_after > 0 else []
            refs.neighbours.after = list(after_ids)
            unique_ids.update(ParagraphId.from_string(raw_id) for raw_id in after_ids)

    if parents:
        refs.parents = list(index.parents(paragraph_id))
        unique_ids.update(ParagraphId.from_string(raw_id) for raw_id in refs.parents)

    if siblings:
        refs.siblings = list(index.siblings(paragraph_id))
        unique_ids.update(ParagraphId.from_string(raw_id) for raw_id in refs.siblings)

    if replacements:
        refs.replacements = list(index.replacements(paragraph_id))
        unique_ids.update(ParagraphId.from_string(raw_id) for raw_id in refs.replacements)

    return refs, list(unique_ids)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
4
|
+
#
|
|
5
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
6
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
7
|
+
#
|
|
8
|
+
# AGPL:
|
|
9
|
+
# This program is free software: you can redistribute it and/or modify
|
|
10
|
+
# it under the terms of the GNU Affero General Public License as
|
|
11
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
12
|
+
# License, or (at your option) any later version.
|
|
13
|
+
#
|
|
14
|
+
# This program is distributed in the hope that it will be useful,
|
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
17
|
+
# GNU Affero General Public License for more details.
|
|
18
|
+
#
|
|
19
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
20
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
21
|
+
#
|
|
22
|
+
|
|
23
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
24
|
+
from nucliadb.models.internal.augment import (
|
|
25
|
+
ResourceOrigin,
|
|
26
|
+
ResourceProp,
|
|
27
|
+
ResourceSecurity,
|
|
28
|
+
ResourceSummary,
|
|
29
|
+
ResourceTitle,
|
|
30
|
+
)
|
|
31
|
+
from nucliadb.search.augmentor.resources import db_augment_resource
|
|
32
|
+
from nucliadb_models import hydration as hydration_models
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def hydrate_resource(
    resource: Resource, rid: str, config: hydration_models.ResourceHydration
) -> hydration_models.HydratedResource:
    """Hydrate a resource with the properties enabled in `config`.

    The resource id and slug are always set. Title, summary, origin and
    security are requested from `db_augment_resource` only when enabled in
    `config`, and whatever it returns for them is copied to the result.
    """
    basic = await resource.get_basic()
    # NOTE(review): assumes `get_basic()` never returns None here -- confirm
    hydrated = hydration_models.HydratedResource(id=rid, slug=basic.slug)

    # config flag -> resource property to request when the flag is set
    selectable = (
        (config.title, ResourceTitle),
        (config.summary, ResourceSummary),
        (config.origin, ResourceOrigin),
        (config.security, ResourceSecurity),
    )
    select: list[ResourceProp] = [prop_type() for enabled, prop_type in selectable if enabled]

    augmented = await db_augment_resource(resource, select)

    hydrated.title = augmented.title
    hydrated.summary = augmented.summary
    hydrated.origin = augmented.origin
    hydrated.security = augmented.security

    return hydrated
|
|
@@ -19,10 +19,10 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
from base64 import b64encode
|
|
22
|
-
from typing import Optional
|
|
23
22
|
|
|
24
23
|
from nucliadb.common import datamanagers
|
|
25
24
|
from nucliadb.ingest.fields.base import Field
|
|
25
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
26
26
|
from nucliadb.search.predict_models import (
|
|
27
27
|
FieldInfo,
|
|
28
28
|
NameOperationFilter,
|
|
@@ -40,8 +40,8 @@ async def run_agents(
|
|
|
40
40
|
kbid: str,
|
|
41
41
|
rid: str,
|
|
42
42
|
user_id: str,
|
|
43
|
-
filters:
|
|
44
|
-
agent_ids:
|
|
43
|
+
filters: list[AgentsFilter] | None = None,
|
|
44
|
+
agent_ids: list[str] | None = None,
|
|
45
45
|
) -> RunAgentsResponse:
|
|
46
46
|
fields = await fetch_resource_fields(kbid, rid)
|
|
47
47
|
|
|
@@ -56,7 +56,7 @@ async def run_agents(
|
|
|
56
56
|
return await predict.run_agents(kbid, item)
|
|
57
57
|
|
|
58
58
|
|
|
59
|
-
def _parse_filters(filters:
|
|
59
|
+
def _parse_filters(filters: list[AgentsFilter] | None) -> list[NameOperationFilter] | None:
|
|
60
60
|
if filters is None:
|
|
61
61
|
return None
|
|
62
62
|
return [
|
|
@@ -69,7 +69,7 @@ def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameO
|
|
|
69
69
|
|
|
70
70
|
async def fetch_resource_fields(kbid: str, rid: str) -> list[FieldInfo]:
|
|
71
71
|
async with datamanagers.with_ro_transaction() as txn:
|
|
72
|
-
resource = await
|
|
72
|
+
resource = await Resource.get(txn, kbid=kbid, rid=rid)
|
|
73
73
|
if resource is None:
|
|
74
74
|
raise ResourceNotFoundError()
|
|
75
75
|
fields = await resource.get_fields(force=True)
|