nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
-
#
|
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
-
#
|
|
6
|
-
# AGPL:
|
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
-
# License, or (at your option) any later version.
|
|
11
|
-
#
|
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
-
# GNU Affero General Public License for more details.
|
|
16
|
-
#
|
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
-
#
|
|
20
|
-
|
|
21
|
-
from typing import cast
|
|
22
|
-
|
|
23
|
-
from nidx_protos.noderesources_pb2 import Resource as IndexMessage
|
|
24
|
-
|
|
25
|
-
from nucliadb.common.maindb.driver import Transaction
|
|
26
|
-
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
|
27
|
-
from nucliadb.common.maindb.utils import get_driver
|
|
28
|
-
from nucliadb_telemetry import metrics
|
|
29
|
-
|
|
30
|
-
from ..resource import Resource
|
|
31
|
-
|
|
32
|
-
# Metrics observer for catalog writes. The "type" label is filled in by each
# wrapped operation below ("update" / "delete") through `observer.wrap`.
observer = metrics.Observer("pg_catalog_write", labels={"type": ""})
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _pg_transaction(txn: Transaction) -> PGTransaction:
    """Expose a generic transaction as a `PGTransaction` for the type checker.

    This is a `typing.cast`: the object is returned untouched and no runtime
    type check is performed.
    """
    pg_txn = cast(PGTransaction, txn)
    return pg_txn
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def pgcatalog_enabled(kbid):
    """Tell whether the catalog can be used: the main driver must be a `PGDriver`.

    `kbid` is accepted but not consulted by this check.
    """
    driver = get_driver()
    return isinstance(driver, PGDriver)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def extract_facets(labels):
    """Return the set of every "/"-separated prefix of each label.

    For "/l/a/b" this yields "/l", "/l/a" and "/l/a/b". Whatever precedes the
    first "/" of a label is discarded, so a label without any "/" contributes
    nothing.
    """
    prefixes = set()
    for label in labels:
        # Drop the chunk before the first "/" (empty for labels starting with "/")
        segments = label.split("/")[1:]
        prefixes.update(
            "/" + "/".join(segments[:depth]) for depth in range(1, len(segments) + 1)
        )
    return prefixes
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@observer.wrap({"type": "update"})
async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
    """Upsert a resource's row in the `catalog` table and rebuild its facets.

    Does nothing when the catalog is not enabled (see `pgcatalog_enabled`).

    Args:
        txn: transaction whose PG connection is used to run the statements.
        kbid: knowledge box id; only used for the enabled check, the rows
            themselves are keyed with `resource.kb.kbid`.
        resource: resource to index; its basic metadata must be present.
        index_message: index message whose `labels` seed the catalog labels.

    Raises:
        ValueError: if `resource.basic` is None.
    """
    if not pgcatalog_enabled(kbid):
        return

    basic = resource.basic
    if basic is None:
        raise ValueError("Cannot index into the catalog a resource without basic metadata ")

    created_at = basic.created.ToDatetime()
    # The stored modification date is never earlier than the creation date
    modified_at = max(basic.modified.ToDatetime(), created_at)

    # Both tables are keyed by (kbid, rid)
    row_key = {"kbid": resource.kb.kbid, "rid": resource.uuid}

    async with _pg_transaction(txn).connection.cursor() as cur:
        # Labels the user cancelled must not be indexed
        cancelled_labels = {
            f"/l/{clf.labelset}/{clf.label}"
            for clf in basic.usermetadata.classifications
            if clf.cancelled_by_user
        }

        # Start from the index message labels, then add the classification
        # labels computed for each field
        labels = list(index_message.labels)
        computed_labels = (
            f"/l/{clf.labelset}/{clf.label}"
            for field_classification in basic.computedmetadata.field_classifications
            for clf in field_classification.classifications
        )
        labels.extend(label for label in computed_labels if label not in cancelled_labels)

        await cur.execute(
            """
            INSERT INTO catalog
            (kbid, rid, title, created_at, modified_at, labels, slug)
            VALUES
            (%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s, %(slug)s)
            ON CONFLICT (kbid, rid) DO UPDATE SET
            title = excluded.title,
            created_at = excluded.created_at,
            modified_at = excluded.modified_at,
            labels = excluded.labels,
            slug = excluded.slug""",
            {
                **row_key,
                "title": basic.title,
                "created_at": created_at,
                "modified_at": modified_at,
                "labels": labels,
                "slug": basic.slug,
            },
        )

        # Facets are replaced wholesale: drop the old ones, insert the new set
        await cur.execute(
            "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
            dict(row_key),
        )
        await cur.execute(
            "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
            {**row_key, "facets": list(extract_facets(labels))},
        )
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
@observer.wrap({"type": "delete"})
async def pgcatalog_delete(txn: Transaction, kbid: str, rid: str):
    """Delete the `catalog` row of resource `rid` in knowledge box `kbid`.

    Does nothing when the catalog is not enabled (see `pgcatalog_enabled`).
    """
    if pgcatalog_enabled(kbid):
        params = {"kbid": kbid, "rid": rid}
        async with _pg_transaction(txn).connection.cursor() as cur:
            await cur.execute("DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", params)
|
|
@@ -1,197 +0,0 @@
|
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
-
#
|
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
-
#
|
|
6
|
-
# AGPL:
|
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
-
# License, or (at your option) any later version.
|
|
11
|
-
#
|
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
-
# GNU Affero General Public License for more details.
|
|
16
|
-
#
|
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
-
#
|
|
20
|
-
import asyncio
|
|
21
|
-
import logging
|
|
22
|
-
from contextlib import AsyncExitStack
|
|
23
|
-
from typing import Optional
|
|
24
|
-
|
|
25
|
-
from pydantic import BaseModel
|
|
26
|
-
|
|
27
|
-
from nucliadb.common.external_index_providers.base import TextBlockMatch
|
|
28
|
-
from nucliadb.common.ids import FieldId
|
|
29
|
-
from nucliadb.common.maindb.utils import get_driver
|
|
30
|
-
from nucliadb.ingest.serialize import managed_serialize
|
|
31
|
-
from nucliadb.search.search import cache, paragraphs
|
|
32
|
-
from nucliadb_models.common import FieldTypeName
|
|
33
|
-
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
|
34
|
-
from nucliadb_models.search import (
|
|
35
|
-
FindParagraph,
|
|
36
|
-
ResourceProperties,
|
|
37
|
-
)
|
|
38
|
-
from nucliadb_telemetry.metrics import Observer
|
|
39
|
-
from nucliadb_utils import const
|
|
40
|
-
from nucliadb_utils.asyncio_utils import ConcurrentRunner
|
|
41
|
-
from nucliadb_utils.utilities import has_feature
|
|
42
|
-
|
|
43
|
-
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
|
|
44
|
-
|
|
45
|
-
# Observer wrapped around the async hydration functions of this module; each
# one sets its own "type" label through `hydrator_observer.wrap`.
hydrator_observer = Observer("hydrator", labels={"type": ""})
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class ResourceHydrationOptions(BaseModel):
    """
    Options for hydrating resources.
    """

    # Resource properties to serialize; passed as `show` to `managed_serialize`
    show: list[ResourceProperties] = []
    # Extracted data types to serialize; passed as `extracted` to `managed_serialize`
    extracted: list[ExtractedDataTypeName] = []
    # Passed as `field_type_filter` to `managed_serialize`
    field_type_filter: list[FieldTypeName] = []
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
class TextBlockHydrationOptions(BaseModel):
    """
    Options for hydrating text blocks (aka paragraphs).
    """

    # whether to highlight the text block with `<mark>...</mark>` tags or not
    highlight: bool = False

    # list of exact matches to highlight (forwarded as `ematches` when fetching
    # the paragraph text)
    ematches: Optional[list[str]] = None

    # If true, only hydrate the text block if its text is not already populated
    only_hydrate_empty: bool = False
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@hydrator_observer.wrap({"type": "resource_text"})
async def hydrate_resource_text(
    kbid: str, rid: str, *, max_concurrent_tasks: int
) -> list[tuple[FieldId, str]]:
    """Collect the extracted text of every field of a resource, plus its summary.

    Args:
        kbid: knowledge box id.
        rid: resource id.
        max_concurrent_tasks: passed as `max_tasks` to the `ConcurrentRunner`
            that fetches the field texts.

    Returns:
        One `(field_id, text)` pair per field whose extracted text was found;
        an empty list when the resource itself is not found.
    """
    resource = await cache.get_resource(kbid, rid)
    if resource is None:  # pragma: no cover
        return []

    runner = ConcurrentRunner(max_tasks=max_concurrent_tasks)
    async with get_driver().ro_transaction() as txn:
        resource.txn = txn

        # One text-extraction task per field of the resource
        for field_type, field_key in await resource.get_fields(force=True):
            runner.schedule(hydrate_field_text(kbid, FieldId.from_pb(rid, field_type, field_key)))

        # Include the summary as well
        summary_id = FieldId(rid=rid, type="a", key="summary")
        runner.schedule(hydrate_field_text(kbid, summary_id))

        # Wait for the results
        outcomes = await runner.wait()

    # Fields without extracted text come back as None and are dropped
    return [outcome for outcome in outcomes if outcome is not None]
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
@hydrator_observer.wrap({"type": "resource_metadata"})
async def hydrate_resource_metadata(
    kbid: str,
    resource_id: str,
    options: ResourceHydrationOptions,
    *,
    concurrency_control: Optional[asyncio.Semaphore] = None,
    service_name: Optional[str] = None,
) -> Optional[Resource]:
    """Fetch resource metadata and return it serialized.

    Args:
        kbid: knowledge box id.
        resource_id: id of the resource to serialize.
        options: what to include in the serialized resource. It is never
            modified by this function.
        concurrency_control: optional semaphore held while the resource is
            being serialized.
        service_name: forwarded to `managed_serialize`.

    Returns:
        The serialized resource, or None (after logging a warning) when the
        serializer returns nothing for this resource.
    """
    show = options.show
    extracted = options.extracted

    if ResourceProperties.EXTRACTED in show and has_feature(
        const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
    ):
        # Returning extracted metadata in search results is deprecated and this flag
        # will be set to True for all KBs in the future.
        # Filter into a new list rather than `show.remove(...)`: `show` is the
        # caller's own `options.show` list and must not be mutated here.
        show = [prop for prop in show if prop != ResourceProperties.EXTRACTED]
        extracted = []

    async with AsyncExitStack() as stack:
        if concurrency_control is not None:
            await stack.enter_async_context(concurrency_control)

        async with get_driver().ro_transaction() as ro_txn:
            serialized_resource = await managed_serialize(
                txn=ro_txn,
                kbid=kbid,
                rid=resource_id,
                show=show,
                field_type_filter=options.field_type_filter,
                extracted=extracted,
                service_name=service_name,
            )
            if serialized_resource is None:
                logger.warning(
                    "Resource not found in database", extra={"kbid": kbid, "rid": resource_id}
                )
    return serialized_resource
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
@hydrator_observer.wrap({"type": "field_text"})
async def hydrate_field_text(
    kbid: str,
    field_id: FieldId,
) -> Optional[tuple[FieldId, str]]:
    """Return `(field_id, text)` for a field, or None if it has no extracted text.

    When `field_id` carries a subfield id, the text comes from the matching
    `split_text` entry; otherwise the whole extracted text is returned.
    """
    extracted = await cache.get_extracted_text_from_field_id(kbid, field_id)
    if extracted is None:  # pragma: no cover
        return None

    text = extracted.split_text[field_id.subfield_id] if field_id.subfield_id else extracted.text
    return field_id, text
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
@hydrator_observer.wrap({"type": "text_block"})
async def hydrate_text_block(
    kbid: str,
    text_block: TextBlockMatch,
    options: TextBlockHydrationOptions,
    *,
    concurrency_control: Optional[asyncio.Semaphore] = None,
) -> TextBlockMatch:
    """Fill in `text_block.text` with its paragraph text and return the same object.

    The block is left untouched when `options.only_hydrate_empty` is set and
    it already has text. If `concurrency_control` is given, the semaphore is
    held while the paragraph text is fetched.
    """
    already_populated = options.only_hydrate_empty and text_block.text
    if not already_populated:
        async with AsyncExitStack() as stack:
            if concurrency_control is not None:
                await stack.enter_async_context(concurrency_control)

            paragraph_text = await paragraphs.get_paragraph_text(
                kbid=kbid,
                paragraph_id=text_block.paragraph_id,
                highlight=options.highlight,
                matches=[],  # TODO: this was never implemented
                ematches=options.ematches,
            )
            text_block.text = paragraph_text
    return text_block
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
    """Convert a `TextBlockMatch` into the `FindParagraph` model.

    Fields are copied one to one, except that a missing text becomes an empty
    string and the id is the full paragraph id.
    """
    paragraph_fields = {
        "id": text_block.paragraph_id.full(),
        "text": text_block.text or "",
        "score": text_block.score,
        "score_type": text_block.score_type,
        "order": text_block.order,
        "labels": text_block.paragraph_labels,
        "fuzzy_result": text_block.fuzzy_search,
        "is_a_table": text_block.is_a_table,
        "reference": text_block.representation_file,
        "page_with_visual": text_block.page_with_visual,
        "position": text_block.position,
        "relevant_relations": text_block.relevant_relations,
    }
    return FindParagraph(**paragraph_fields)
|