nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -17,33 +17,11 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
import asyncio
|
|
21
|
-
import logging
|
|
22
|
-
from contextlib import AsyncExitStack
|
|
23
|
-
from typing import Optional
|
|
24
|
-
|
|
25
20
|
from pydantic import BaseModel
|
|
26
21
|
|
|
27
|
-
from nucliadb.common.external_index_providers.base import TextBlockMatch
|
|
28
|
-
from nucliadb.common.ids import FieldId
|
|
29
|
-
from nucliadb.common.maindb.utils import get_driver
|
|
30
|
-
from nucliadb.ingest.serialize import managed_serialize
|
|
31
|
-
from nucliadb.search.search import cache
|
|
32
|
-
from nucliadb.search.search.paragraphs import get_paragraph_text
|
|
33
22
|
from nucliadb_models.common import FieldTypeName
|
|
34
|
-
from nucliadb_models.resource import ExtractedDataTypeName
|
|
35
|
-
from nucliadb_models.search import
|
|
36
|
-
FindParagraph,
|
|
37
|
-
ResourceProperties,
|
|
38
|
-
)
|
|
39
|
-
from nucliadb_telemetry.metrics import Observer
|
|
40
|
-
from nucliadb_utils import const
|
|
41
|
-
from nucliadb_utils.asyncio_utils import ConcurrentRunner
|
|
42
|
-
from nucliadb_utils.utilities import has_feature
|
|
43
|
-
|
|
44
|
-
logger = logging.getLogger(__name__)
|
|
45
|
-
|
|
46
|
-
hydrator_observer = Observer("hydrator", labels={"type": ""})
|
|
23
|
+
from nucliadb_models.resource import ExtractedDataTypeName
|
|
24
|
+
from nucliadb_models.search import ResourceProperties
|
|
47
25
|
|
|
48
26
|
|
|
49
27
|
class ResourceHydrationOptions(BaseModel):
|
|
@@ -65,134 +43,7 @@ class TextBlockHydrationOptions(BaseModel):
|
|
|
65
43
|
highlight: bool = False
|
|
66
44
|
|
|
67
45
|
# list of exact matches to highlight
|
|
68
|
-
ematches:
|
|
46
|
+
ematches: list[str] | None = None
|
|
69
47
|
|
|
70
48
|
# If true, only hydrate the text block if its text is not already populated
|
|
71
49
|
only_hydrate_empty: bool = False
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
@hydrator_observer.wrap({"type": "resource_text"})
|
|
75
|
-
async def hydrate_resource_text(
|
|
76
|
-
kbid: str, rid: str, *, max_concurrent_tasks: int
|
|
77
|
-
) -> list[tuple[FieldId, str]]:
|
|
78
|
-
resource = await cache.get_resource(kbid, rid)
|
|
79
|
-
if resource is None: # pragma: no cover
|
|
80
|
-
return []
|
|
81
|
-
|
|
82
|
-
# Schedule the extraction of the text of each field in the resource
|
|
83
|
-
async with get_driver().ro_transaction() as txn:
|
|
84
|
-
resource.txn = txn
|
|
85
|
-
runner = ConcurrentRunner(max_tasks=max_concurrent_tasks)
|
|
86
|
-
for field_type, field_key in await resource.get_fields(force=True):
|
|
87
|
-
field_id = FieldId.from_pb(rid, field_type, field_key)
|
|
88
|
-
runner.schedule(hydrate_field_text(kbid, field_id))
|
|
89
|
-
|
|
90
|
-
# Include the summary aswell
|
|
91
|
-
runner.schedule(hydrate_field_text(kbid, FieldId(rid=rid, type="a", key="summary")))
|
|
92
|
-
|
|
93
|
-
# Wait for the results
|
|
94
|
-
field_extracted_texts = await runner.wait()
|
|
95
|
-
|
|
96
|
-
return [text for text in field_extracted_texts if text is not None]
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
@hydrator_observer.wrap({"type": "resource_metadata"})
|
|
100
|
-
async def hydrate_resource_metadata(
|
|
101
|
-
kbid: str,
|
|
102
|
-
resource_id: str,
|
|
103
|
-
options: ResourceHydrationOptions,
|
|
104
|
-
*,
|
|
105
|
-
concurrency_control: Optional[asyncio.Semaphore] = None,
|
|
106
|
-
service_name: Optional[str] = None,
|
|
107
|
-
) -> Optional[Resource]:
|
|
108
|
-
"""Fetch resource metadata and return it serialized."""
|
|
109
|
-
show = options.show
|
|
110
|
-
extracted = options.extracted
|
|
111
|
-
|
|
112
|
-
if ResourceProperties.EXTRACTED in show and has_feature(
|
|
113
|
-
const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
|
|
114
|
-
):
|
|
115
|
-
# Returning extracted metadata in search results is deprecated and this flag
|
|
116
|
-
# will be set to True for all KBs in the future.
|
|
117
|
-
show.remove(ResourceProperties.EXTRACTED)
|
|
118
|
-
extracted = []
|
|
119
|
-
|
|
120
|
-
async with AsyncExitStack() as stack:
|
|
121
|
-
if concurrency_control is not None:
|
|
122
|
-
await stack.enter_async_context(concurrency_control)
|
|
123
|
-
|
|
124
|
-
async with get_driver().ro_transaction() as ro_txn:
|
|
125
|
-
serialized_resource = await managed_serialize(
|
|
126
|
-
txn=ro_txn,
|
|
127
|
-
kbid=kbid,
|
|
128
|
-
rid=resource_id,
|
|
129
|
-
show=show,
|
|
130
|
-
field_type_filter=options.field_type_filter,
|
|
131
|
-
extracted=extracted,
|
|
132
|
-
service_name=service_name,
|
|
133
|
-
)
|
|
134
|
-
if serialized_resource is None:
|
|
135
|
-
logger.warning(
|
|
136
|
-
"Resource not found in database", extra={"kbid": kbid, "rid": resource_id}
|
|
137
|
-
)
|
|
138
|
-
return serialized_resource
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
@hydrator_observer.wrap({"type": "field_text"})
|
|
142
|
-
async def hydrate_field_text(
|
|
143
|
-
kbid: str,
|
|
144
|
-
field_id: FieldId,
|
|
145
|
-
) -> Optional[tuple[FieldId, str]]:
|
|
146
|
-
extracted_text_pb = await cache.get_extracted_text_from_field_id(kbid, field_id)
|
|
147
|
-
if extracted_text_pb is None: # pragma: no cover
|
|
148
|
-
return None
|
|
149
|
-
|
|
150
|
-
if field_id.subfield_id:
|
|
151
|
-
return field_id, extracted_text_pb.split_text[field_id.subfield_id]
|
|
152
|
-
else:
|
|
153
|
-
return field_id, extracted_text_pb.text
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
@hydrator_observer.wrap({"type": "text_block"})
|
|
157
|
-
async def hydrate_text_block(
|
|
158
|
-
kbid: str,
|
|
159
|
-
text_block: TextBlockMatch,
|
|
160
|
-
options: TextBlockHydrationOptions,
|
|
161
|
-
*,
|
|
162
|
-
concurrency_control: Optional[asyncio.Semaphore] = None,
|
|
163
|
-
) -> TextBlockMatch:
|
|
164
|
-
"""Given a `text_block`, fetch its corresponding text, modify and return the
|
|
165
|
-
`text_block` object.
|
|
166
|
-
|
|
167
|
-
"""
|
|
168
|
-
if options.only_hydrate_empty and text_block.text:
|
|
169
|
-
return text_block
|
|
170
|
-
async with AsyncExitStack() as stack:
|
|
171
|
-
if concurrency_control is not None:
|
|
172
|
-
await stack.enter_async_context(concurrency_control)
|
|
173
|
-
|
|
174
|
-
text_block.text = await get_paragraph_text(
|
|
175
|
-
kbid=kbid,
|
|
176
|
-
paragraph_id=text_block.paragraph_id,
|
|
177
|
-
highlight=options.highlight,
|
|
178
|
-
matches=[], # TODO: this was never implemented
|
|
179
|
-
ematches=options.ematches,
|
|
180
|
-
)
|
|
181
|
-
return text_block
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
|
|
185
|
-
return FindParagraph(
|
|
186
|
-
id=text_block.paragraph_id.full(),
|
|
187
|
-
text=text_block.text or "",
|
|
188
|
-
score=text_block.score,
|
|
189
|
-
score_type=text_block.score_type,
|
|
190
|
-
order=text_block.order,
|
|
191
|
-
labels=text_block.paragraph_labels,
|
|
192
|
-
fuzzy_result=text_block.fuzzy_search,
|
|
193
|
-
is_a_table=text_block.is_a_table,
|
|
194
|
-
reference=text_block.representation_file,
|
|
195
|
-
page_with_visual=text_block.page_with_visual,
|
|
196
|
-
position=text_block.position,
|
|
197
|
-
relevant_relations=text_block.relevant_relations,
|
|
198
|
-
)
|
|
@@ -17,12 +17,25 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
+
from typing import cast
|
|
20
21
|
|
|
22
|
+
from typing_extensions import assert_never
|
|
21
23
|
|
|
22
24
|
from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId
|
|
23
|
-
from nucliadb.
|
|
24
|
-
from nucliadb.ingest.
|
|
25
|
-
from nucliadb.
|
|
25
|
+
from nucliadb.ingest.fields.base import Field
|
|
26
|
+
from nucliadb.ingest.fields.conversation import Conversation
|
|
27
|
+
from nucliadb.ingest.fields.file import File
|
|
28
|
+
from nucliadb.ingest.fields.generic import Generic
|
|
29
|
+
from nucliadb.ingest.fields.link import Link
|
|
30
|
+
from nucliadb.ingest.fields.text import Text
|
|
31
|
+
from nucliadb.models.internal.augment import ConversationProp, FieldProp, FieldText, FieldValue
|
|
32
|
+
from nucliadb.search.augmentor.fields import (
|
|
33
|
+
db_augment_conversation_field,
|
|
34
|
+
db_augment_file_field,
|
|
35
|
+
db_augment_generic_field,
|
|
36
|
+
db_augment_link_field,
|
|
37
|
+
db_augment_text_field,
|
|
38
|
+
)
|
|
26
39
|
from nucliadb_models import hydration as hydration_models
|
|
27
40
|
from nucliadb_models.common import FieldTypeName
|
|
28
41
|
|
|
@@ -32,144 +45,173 @@ def page_preview_id(page_number: int) -> str:
|
|
|
32
45
|
return f"{page_number}"
|
|
33
46
|
|
|
34
47
|
|
|
35
|
-
async def hydrate_field(
|
|
48
|
+
async def hydrate_field(field: Field, field_id: FieldId, config: hydration_models.FieldHydration):
|
|
36
49
|
field_type = FIELD_TYPE_STR_TO_NAME[field_id.type]
|
|
37
50
|
|
|
38
51
|
if field_type == FieldTypeName.TEXT:
|
|
39
52
|
if not config.text is not None:
|
|
40
53
|
return
|
|
41
|
-
|
|
54
|
+
field = cast(Text, field)
|
|
55
|
+
return await hydrate_text_field(field, field_id, config.text)
|
|
42
56
|
|
|
43
57
|
elif field_type == FieldTypeName.FILE is not None:
|
|
44
58
|
if not config.file:
|
|
45
59
|
return
|
|
46
|
-
|
|
60
|
+
field = cast(File, field)
|
|
61
|
+
return await hydrate_file_field(field, field_id, config.file)
|
|
47
62
|
|
|
48
63
|
elif field_type == FieldTypeName.LINK is not None:
|
|
49
64
|
if not config.link:
|
|
50
65
|
return
|
|
51
|
-
|
|
66
|
+
field = cast(Link, field)
|
|
67
|
+
return await hydrate_link_field(field, field_id, config.link)
|
|
52
68
|
|
|
53
69
|
elif field_type == FieldTypeName.CONVERSATION is not None:
|
|
54
70
|
if not config.conversation:
|
|
55
71
|
return
|
|
56
|
-
|
|
72
|
+
field = cast(Conversation, field)
|
|
73
|
+
return await hydrate_conversation_field(field, field_id, config.conversation)
|
|
57
74
|
|
|
58
75
|
elif field_type == FieldTypeName.GENERIC is not None:
|
|
59
76
|
if not config.generic:
|
|
60
77
|
return
|
|
61
|
-
|
|
78
|
+
field = cast(Generic, field)
|
|
79
|
+
return await hydrate_generic_field(field, field_id, config.generic)
|
|
62
80
|
|
|
63
81
|
else: # pragma: no cover
|
|
64
|
-
|
|
65
|
-
# that is, if we are missing some ifs
|
|
66
|
-
_a: int = "a"
|
|
82
|
+
assert_never(field_type)
|
|
67
83
|
|
|
68
84
|
|
|
69
85
|
async def hydrate_text_field(
|
|
70
|
-
|
|
86
|
+
field: Text,
|
|
71
87
|
field_id: FieldId,
|
|
72
88
|
config: hydration_models.TextFieldHydration,
|
|
73
89
|
) -> hydration_models.HydratedTextField:
|
|
90
|
+
select: list[FieldProp] = []
|
|
91
|
+
if config.value:
|
|
92
|
+
select.append(FieldValue())
|
|
93
|
+
if config.extracted_text:
|
|
94
|
+
select.append(FieldText())
|
|
95
|
+
|
|
96
|
+
augmented = await db_augment_text_field(field, field_id, select)
|
|
97
|
+
|
|
74
98
|
hydrated = hydration_models.HydratedTextField(
|
|
75
99
|
id=field_id.full(),
|
|
76
100
|
resource=field_id.rid,
|
|
77
101
|
field_type=FieldTypeName.TEXT,
|
|
78
102
|
)
|
|
79
103
|
|
|
80
|
-
if config.
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
104
|
+
if config.value and augmented.value:
|
|
105
|
+
hydrated.value = augmented.value
|
|
106
|
+
|
|
107
|
+
if config.extracted_text and augmented.text:
|
|
108
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
|
|
85
109
|
|
|
86
110
|
return hydrated
|
|
87
111
|
|
|
88
112
|
|
|
89
113
|
async def hydrate_file_field(
|
|
90
|
-
|
|
114
|
+
field: File,
|
|
91
115
|
field_id: FieldId,
|
|
92
116
|
config: hydration_models.FileFieldHydration,
|
|
93
117
|
) -> hydration_models.HydratedFileField:
|
|
118
|
+
select: list[FieldProp] = []
|
|
119
|
+
if config.value:
|
|
120
|
+
select.append(FieldValue())
|
|
121
|
+
if config.extracted_text:
|
|
122
|
+
select.append(FieldText())
|
|
123
|
+
|
|
124
|
+
augmented = await db_augment_file_field(field, field_id, select)
|
|
125
|
+
|
|
94
126
|
hydrated = hydration_models.HydratedFileField(
|
|
95
127
|
id=field_id.full(),
|
|
96
128
|
resource=field_id.rid,
|
|
97
129
|
field_type=FieldTypeName.FILE,
|
|
98
130
|
)
|
|
99
131
|
|
|
100
|
-
if config.value:
|
|
101
|
-
|
|
102
|
-
value = await field.get_value()
|
|
103
|
-
hydrated.value = from_proto.field_file(value)
|
|
132
|
+
if config.value and augmented.value:
|
|
133
|
+
hydrated.value = augmented.value
|
|
104
134
|
|
|
105
|
-
if config.extracted_text:
|
|
106
|
-
|
|
107
|
-
if field_text is not None:
|
|
108
|
-
(_, text) = field_text
|
|
109
|
-
hydrated.extracted = hydration_models.FieldExtractedData(text=text)
|
|
135
|
+
if config.extracted_text and augmented.text:
|
|
136
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
|
|
110
137
|
|
|
111
138
|
return hydrated
|
|
112
139
|
|
|
113
140
|
|
|
114
141
|
async def hydrate_link_field(
|
|
115
|
-
|
|
142
|
+
field: Link,
|
|
116
143
|
field_id: FieldId,
|
|
117
144
|
config: hydration_models.LinkFieldHydration,
|
|
118
145
|
) -> hydration_models.HydratedLinkField:
|
|
146
|
+
select: list[FieldProp] = []
|
|
147
|
+
if config.value:
|
|
148
|
+
select.append(FieldValue())
|
|
149
|
+
if config.extracted_text:
|
|
150
|
+
select.append(FieldText())
|
|
151
|
+
|
|
152
|
+
augmented = await db_augment_link_field(field, field_id, select)
|
|
153
|
+
|
|
119
154
|
hydrated = hydration_models.HydratedLinkField(
|
|
120
155
|
id=field_id.full(),
|
|
121
156
|
resource=field_id.rid,
|
|
122
157
|
field_type=FieldTypeName.LINK,
|
|
123
158
|
)
|
|
124
159
|
|
|
125
|
-
if config.value:
|
|
126
|
-
|
|
127
|
-
value = await field.get_value()
|
|
128
|
-
hydrated.value = from_proto.field_link(value)
|
|
160
|
+
if config.value and augmented.value:
|
|
161
|
+
hydrated.value = augmented.value
|
|
129
162
|
|
|
130
|
-
if config.extracted_text:
|
|
131
|
-
|
|
132
|
-
if field_text is not None:
|
|
133
|
-
(_, text) = field_text
|
|
134
|
-
hydrated.extracted = hydration_models.FieldExtractedData(text=text)
|
|
163
|
+
if config.extracted_text and augmented.text:
|
|
164
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
|
|
135
165
|
|
|
136
166
|
return hydrated
|
|
137
167
|
|
|
138
168
|
|
|
139
169
|
async def hydrate_conversation_field(
|
|
140
|
-
|
|
170
|
+
field: Conversation,
|
|
141
171
|
field_id: FieldId,
|
|
142
172
|
config: hydration_models.ConversationFieldHydration,
|
|
143
173
|
) -> hydration_models.HydratedConversationField:
|
|
174
|
+
select: list[ConversationProp] = []
|
|
175
|
+
if config.value:
|
|
176
|
+
select.append(FieldValue())
|
|
177
|
+
|
|
178
|
+
augmented = await db_augment_conversation_field(field, field_id, select)
|
|
179
|
+
|
|
144
180
|
hydrated = hydration_models.HydratedConversationField(
|
|
145
181
|
id=field_id.full(),
|
|
146
182
|
resource=field_id.rid,
|
|
147
183
|
field_type=FieldTypeName.CONVERSATION,
|
|
148
184
|
)
|
|
149
|
-
|
|
185
|
+
|
|
186
|
+
if config.value and augmented.value:
|
|
187
|
+
hydrated.value = augmented.value
|
|
188
|
+
|
|
150
189
|
return hydrated
|
|
151
190
|
|
|
152
191
|
|
|
153
192
|
async def hydrate_generic_field(
|
|
154
|
-
|
|
193
|
+
field: Generic,
|
|
155
194
|
field_id: FieldId,
|
|
156
195
|
config: hydration_models.GenericFieldHydration,
|
|
157
196
|
) -> hydration_models.HydratedGenericField:
|
|
197
|
+
select: list[FieldProp] = []
|
|
198
|
+
if config.value:
|
|
199
|
+
select.append(FieldValue())
|
|
200
|
+
if config.extracted_text:
|
|
201
|
+
select.append(FieldText())
|
|
202
|
+
|
|
203
|
+
augmented = await db_augment_generic_field(field, field_id, select)
|
|
204
|
+
|
|
158
205
|
hydrated = hydration_models.HydratedGenericField(
|
|
159
206
|
id=field_id.full(),
|
|
160
207
|
resource=field_id.rid,
|
|
161
208
|
field_type=FieldTypeName.GENERIC,
|
|
162
209
|
)
|
|
163
210
|
|
|
164
|
-
if config.value:
|
|
165
|
-
|
|
166
|
-
value = await field.get_value()
|
|
167
|
-
hydrated.value = value
|
|
211
|
+
if config.value and augmented.value:
|
|
212
|
+
hydrated.value = augmented.value
|
|
168
213
|
|
|
169
|
-
if config.extracted_text:
|
|
170
|
-
|
|
171
|
-
if field_text is not None:
|
|
172
|
-
(_, text) = field_text
|
|
173
|
-
hydrated.extracted = hydration_models.FieldExtractedData(text=text)
|
|
214
|
+
if config.extracted_text and augmented.text:
|
|
215
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
|
|
174
216
|
|
|
175
217
|
return hydrated
|
|
@@ -18,7 +18,9 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import base64
|
|
21
|
-
from typing import
|
|
21
|
+
from typing import cast
|
|
22
|
+
|
|
23
|
+
from typing_extensions import assert_never
|
|
22
24
|
|
|
23
25
|
from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId, ParagraphId
|
|
24
26
|
from nucliadb.ingest.fields.base import Field
|
|
@@ -32,7 +34,7 @@ from nucliadb_utils.utilities import get_storage
|
|
|
32
34
|
|
|
33
35
|
async def paragraph_source_image(
|
|
34
36
|
kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
|
|
35
|
-
) ->
|
|
37
|
+
) -> Image | None:
|
|
36
38
|
"""Certain paragraphs are extracted from images using techniques like OCR or
|
|
37
39
|
inception. If that's the case, return the original image for this paragraph.
|
|
38
40
|
|
|
@@ -66,7 +68,7 @@ async def paragraph_source_image(
|
|
|
66
68
|
|
|
67
69
|
async def download_image(
|
|
68
70
|
kbid: str, field_id: FieldId, image_path: str, *, mime_type: str
|
|
69
|
-
) ->
|
|
71
|
+
) -> Image | None:
|
|
70
72
|
storage = await get_storage(service_name=SERVICE_NAME)
|
|
71
73
|
sf = storage.file_extracted(
|
|
72
74
|
kbid,
|
|
@@ -81,7 +83,7 @@ async def download_image(
|
|
|
81
83
|
return Image(content_type=mime_type, b64encoded=base64.b64encode(raw_image).decode())
|
|
82
84
|
|
|
83
85
|
|
|
84
|
-
async def download_page_preview(field: Field, page: int) ->
|
|
86
|
+
async def download_page_preview(field: Field, page: int) -> Image | None:
|
|
85
87
|
"""Download a specific page preview for a field and return it as an Image.
|
|
86
88
|
As not all fields have previews, this function can return None.
|
|
87
89
|
|
|
@@ -123,8 +125,6 @@ async def download_page_preview(field: Field, page: int) -> Optional[Image]:
|
|
|
123
125
|
image = None
|
|
124
126
|
|
|
125
127
|
else: # pragma: no cover
|
|
126
|
-
|
|
127
|
-
# that is, if we are missing some ifs
|
|
128
|
-
_a: int = "a"
|
|
128
|
+
assert_never(field_type)
|
|
129
129
|
|
|
130
130
|
return image
|
|
@@ -19,12 +19,11 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
from dataclasses import dataclass
|
|
22
|
-
from typing import Optional, Union
|
|
23
22
|
|
|
24
23
|
from nucliadb.common.ids import FieldId, ParagraphId
|
|
25
24
|
from nucliadb.ingest.fields.base import Field
|
|
26
25
|
from nucliadb.ingest.orm.resource import Resource
|
|
27
|
-
from nucliadb.search.
|
|
26
|
+
from nucliadb.search.augmentor.paragraphs import get_paragraph_text
|
|
28
27
|
from nucliadb.search.search.hydrator.fields import page_preview_id
|
|
29
28
|
from nucliadb.search.search.hydrator.images import paragraph_source_image
|
|
30
29
|
from nucliadb_models import hydration as hydration_models
|
|
@@ -112,19 +111,19 @@ class ParagraphIndex:
|
|
|
112
111
|
replacement for replacement in paragraph.relations.replacements
|
|
113
112
|
]
|
|
114
113
|
|
|
115
|
-
def get(self, paragraph_id:
|
|
114
|
+
def get(self, paragraph_id: str | ParagraphId) -> resources_pb2.Paragraph | None:
|
|
116
115
|
paragraph_id = str(paragraph_id)
|
|
117
116
|
return self.paragraphs.get(paragraph_id)
|
|
118
117
|
|
|
119
|
-
def previous(self, paragraph_id:
|
|
118
|
+
def previous(self, paragraph_id: str | ParagraphId) -> str | None:
|
|
120
119
|
paragraph_id = str(paragraph_id)
|
|
121
120
|
return self.neighbours.get((paragraph_id, ParagraphIndex.PREVIOUS))
|
|
122
121
|
|
|
123
|
-
def next(self, paragraph_id:
|
|
122
|
+
def next(self, paragraph_id: str | ParagraphId) -> str | None:
|
|
124
123
|
paragraph_id = str(paragraph_id)
|
|
125
124
|
return self.neighbours.get((paragraph_id, ParagraphIndex.NEXT))
|
|
126
125
|
|
|
127
|
-
def n_previous(self, paragraph_id:
|
|
126
|
+
def n_previous(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
|
|
128
127
|
assert count >= 1, f"can't find negative previous {count}"
|
|
129
128
|
paragraph_id = str(paragraph_id)
|
|
130
129
|
previous: list[str] = []
|
|
@@ -138,7 +137,7 @@ class ParagraphIndex:
|
|
|
138
137
|
current_id = previous_id
|
|
139
138
|
return previous
|
|
140
139
|
|
|
141
|
-
def n_next(self, paragraph_id:
|
|
140
|
+
def n_next(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
|
|
142
141
|
assert count >= 1, f"can't find negative nexts {count}"
|
|
143
142
|
paragraph_id = str(paragraph_id)
|
|
144
143
|
nexts = []
|
|
@@ -152,23 +151,23 @@ class ParagraphIndex:
|
|
|
152
151
|
nexts.append(next_id)
|
|
153
152
|
return nexts
|
|
154
153
|
|
|
155
|
-
def parents(self, paragraph_id:
|
|
154
|
+
def parents(self, paragraph_id: str | ParagraphId) -> list[str]:
|
|
156
155
|
paragraph_id = str(paragraph_id)
|
|
157
156
|
return self.related.get((paragraph_id, ParagraphIndex.PARENTS), [])
|
|
158
157
|
|
|
159
|
-
def siblings(self, paragraph_id:
|
|
158
|
+
def siblings(self, paragraph_id: str | ParagraphId) -> list[str]:
|
|
160
159
|
paragraph_id = str(paragraph_id)
|
|
161
160
|
return self.related.get((paragraph_id, ParagraphIndex.SIBLINGS), [])
|
|
162
161
|
|
|
163
|
-
def replacements(self, paragraph_id:
|
|
162
|
+
def replacements(self, paragraph_id: str | ParagraphId) -> list[str]:
|
|
164
163
|
paragraph_id = str(paragraph_id)
|
|
165
164
|
return self.related.get((paragraph_id, ParagraphIndex.REPLACEMENTS), [])
|
|
166
165
|
|
|
167
166
|
|
|
168
167
|
@dataclass
|
|
169
168
|
class ExtraParagraphHydration:
|
|
170
|
-
field_page:
|
|
171
|
-
field_table_page:
|
|
169
|
+
field_page: int | None
|
|
170
|
+
field_table_page: int | None
|
|
172
171
|
related_paragraph_ids: list[ParagraphId]
|
|
173
172
|
|
|
174
173
|
|
|
@@ -187,7 +186,7 @@ async def hydrate_paragraph(
|
|
|
187
186
|
include more or less text than the originally extracted.
|
|
188
187
|
|
|
189
188
|
"""
|
|
190
|
-
kbid = resource.
|
|
189
|
+
kbid = resource.kbid
|
|
191
190
|
|
|
192
191
|
hydrated = hydration_models.HydratedParagraph(
|
|
193
192
|
id=paragraph_id.full(),
|
|
@@ -199,7 +198,7 @@ async def hydrate_paragraph(
|
|
|
199
198
|
)
|
|
200
199
|
|
|
201
200
|
if config.text:
|
|
202
|
-
text = await
|
|
201
|
+
text = await get_paragraph_text(field, paragraph_id)
|
|
203
202
|
hydrated.text = text
|
|
204
203
|
|
|
205
204
|
requires_paragraph_metadata = config.image or config.table or config.page or config.related
|
|
@@ -210,8 +209,20 @@ async def hydrate_paragraph(
|
|
|
210
209
|
# otherwise, this is a fake paragraph. We can't hydrate anything else here
|
|
211
210
|
|
|
212
211
|
if config.related:
|
|
212
|
+
if config.related.neighbours is not None:
|
|
213
|
+
before = config.related.neighbours.before
|
|
214
|
+
after = config.related.neighbours.after
|
|
215
|
+
else:
|
|
216
|
+
before, after = None, None
|
|
217
|
+
|
|
213
218
|
hydrated.related, related_ids = await related_paragraphs_refs(
|
|
214
|
-
paragraph_id,
|
|
219
|
+
paragraph_id,
|
|
220
|
+
field_paragraphs_index,
|
|
221
|
+
neighbours_before=before,
|
|
222
|
+
neighbours_after=after,
|
|
223
|
+
parents=config.related.parents or False,
|
|
224
|
+
siblings=config.related.siblings or False,
|
|
225
|
+
replacements=config.related.replacements or False,
|
|
215
226
|
)
|
|
216
227
|
extra_hydration.related_paragraph_ids = related_ids
|
|
217
228
|
|
|
@@ -259,7 +270,12 @@ async def hydrate_paragraph(
|
|
|
259
270
|
async def related_paragraphs_refs(
|
|
260
271
|
paragraph_id: ParagraphId,
|
|
261
272
|
index: ParagraphIndex,
|
|
262
|
-
|
|
273
|
+
*,
|
|
274
|
+
neighbours_before: int | None = None,
|
|
275
|
+
neighbours_after: int | None = None,
|
|
276
|
+
parents: bool = False,
|
|
277
|
+
siblings: bool = False,
|
|
278
|
+
replacements: bool = False,
|
|
263
279
|
) -> tuple[hydration_models.RelatedParagraphRefs, list[ParagraphId]]:
|
|
264
280
|
"""Compute the related paragraph references for a specific `paragraph_id`
|
|
265
281
|
and return them with the plain list of unique related paragraphs (to
|
|
@@ -269,36 +285,36 @@ async def related_paragraphs_refs(
|
|
|
269
285
|
hydrated = hydration_models.RelatedParagraphRefs()
|
|
270
286
|
related = set()
|
|
271
287
|
|
|
272
|
-
if
|
|
288
|
+
if neighbours_before or neighbours_after:
|
|
273
289
|
hydrated.neighbours = hydration_models.RelatedNeighbourParagraphRefs()
|
|
274
290
|
|
|
275
|
-
if
|
|
291
|
+
if neighbours_before is not None:
|
|
276
292
|
hydrated.neighbours.before = []
|
|
277
|
-
if
|
|
278
|
-
for previous_id in index.n_previous(paragraph_id,
|
|
293
|
+
if neighbours_before > 0:
|
|
294
|
+
for previous_id in index.n_previous(paragraph_id, neighbours_before):
|
|
279
295
|
hydrated.neighbours.before.insert(0, previous_id)
|
|
280
296
|
related.add(ParagraphId.from_string(previous_id))
|
|
281
297
|
|
|
282
|
-
if
|
|
298
|
+
if neighbours_after is not None:
|
|
283
299
|
hydrated.neighbours.after = []
|
|
284
|
-
if
|
|
285
|
-
for next_id in index.n_next(paragraph_id,
|
|
300
|
+
if neighbours_after > 0:
|
|
301
|
+
for next_id in index.n_next(paragraph_id, neighbours_after):
|
|
286
302
|
hydrated.neighbours.after.append(next_id)
|
|
287
303
|
related.add(ParagraphId.from_string(next_id))
|
|
288
304
|
|
|
289
|
-
if
|
|
305
|
+
if parents:
|
|
290
306
|
hydrated.parents = []
|
|
291
307
|
for parent_id in index.parents(paragraph_id):
|
|
292
308
|
hydrated.parents.append(parent_id)
|
|
293
309
|
related.add(ParagraphId.from_string(parent_id))
|
|
294
310
|
|
|
295
|
-
if
|
|
311
|
+
if siblings:
|
|
296
312
|
hydrated.siblings = []
|
|
297
313
|
for sibling_id in index.siblings(paragraph_id):
|
|
298
314
|
hydrated.siblings.append(sibling_id)
|
|
299
315
|
related.add(ParagraphId.from_string(sibling_id))
|
|
300
316
|
|
|
301
|
-
if
|
|
317
|
+
if replacements:
|
|
302
318
|
hydrated.replacements = []
|
|
303
319
|
for replacement_id in index.replacements(paragraph_id):
|
|
304
320
|
hydrated.replacements.append(replacement_id)
|