nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/search/chat/query.py

@@ -18,33 +18,54 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from
+from collections.abc import AsyncGenerator, Iterable
+from time import time
 
-from nidx_protos.nodereader_pb2 import
-
-    SearchResponse,
-)
+from nidx_protos.nodereader_pb2 import GraphSearchResponse, SearchResponse
+from nuclia_models.predict.generative_responses import GenerativeChunk
 
+from nucliadb.common.external_index_providers.base import TextBlockMatch
+from nucliadb.common.ids import ParagraphId
 from nucliadb.common.models_utils import to_proto
 from nucliadb.search import logger
 from nucliadb.search.predict import AnswerStatusCode, RephraseResponse
 from nucliadb.search.requesters.utils import Method, nidx_query
+from nucliadb.search.search.chat import rpc
 from nucliadb.search.search.chat.exceptions import NoRetrievalResultsError
+from nucliadb.search.search.chat.parser import rao_parse_find
 from nucliadb.search.search.exceptions import IncompleteFindResultsError
 from nucliadb.search.search.find import find
+from nucliadb.search.search.find_merge import text_block_to_find_paragraph
+from nucliadb.search.search.hydrator import ResourceHydrationOptions, TextBlockHydrationOptions
 from nucliadb.search.search.merge import merge_relations_results
 from nucliadb.search.search.metrics import Metrics
-from nucliadb.search.search.
+from nucliadb.search.search.paragraphs import highlight_paragraph
+from nucliadb.search.search.query_parser.fetcher import Fetcher
+from nucliadb.search.search.query_parser.models import Query, RelationQuery, UnitRetrieval
 from nucliadb.search.search.query_parser.parsers.unit_retrieval import convert_retrieval_to_proto
+from nucliadb.search.search.rerankers import RerankableItem, Reranker, RerankingOptions, get_reranker
 from nucliadb.search.settings import settings
 from nucliadb.search.utilities import get_predict
 from nucliadb_models import filters
+from nucliadb_models.augment import (
+    AugmentedResource,
+    AugmentParagraph,
+    AugmentParagraphs,
+    AugmentRequest,
+    AugmentResources,
+    ParagraphMetadata,
+)
+from nucliadb_models.retrieval import RerankerScore, RetrievalMatch, ScoreType
 from nucliadb_models.search import (
+    SCORE_TYPE,
     AskRequest,
     ChatContextMessage,
+    ChatModel,
     ChatOptions,
+    FindField,
     FindOptions,
     FindRequest,
+    FindResource,
     KnowledgeboxFindResults,
     NucliaDBClientType,
     PreQueriesStrategy,
@@ -54,12 +75,14 @@ from nucliadb_models.search import (
     PromptContextOrder,
     Relations,
     RephraseModel,
+    TextPosition,
     parse_rephrase_prompt,
 )
 from nucliadb_protos import audit_pb2
 from nucliadb_protos.utils_pb2 import RelationNode
 from nucliadb_telemetry.errors import capture_exception
-from nucliadb_utils
+from nucliadb_utils import const
+from nucliadb_utils.utilities import get_audit, has_feature
 
 NOT_ENOUGH_CONTEXT_ANSWER = "Not enough data to answer this."
 
@@ -70,9 +93,11 @@ async def rephrase_query(
     query: str,
     user_id: str,
     user_context: list[str],
-    generative_model:
-    chat_history_relevance_threshold:
+    generative_model: str | None = None,
+    chat_history_relevance_threshold: float | None = None,
 ) -> RephraseResponse:
+    # NOTE: When moving /ask to RAO, this will need to change to whatever client/utility is used
+    # to call NUA predict (internally or externally in the case of onprem).
     predict = get_predict()
     req = RephraseModel(
         question=query,
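Note: deleted signature lines in this and later hunks are cut off by the diff viewer, so only their prefixes survive above. The added lines make the recurring pattern clear, though: annotations move from typing.Optional to PEP 604 union syntax. A minimal before/after sketch; the old form is an assumption, since the removed text is truncated:

from typing import Optional

# assumed pre-6.10 style (the removed lines are truncated in this diff view)
def rephrase_old(generative_model: Optional[str] = None) -> None: ...

# 6.10.0 style, as the added lines read (PEP 604 unions)
def rephrase_new(generative_model: str | None = None) -> None: ...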
@@ -94,8 +119,8 @@ async def get_find_results(
     user: str,
     origin: str,
     metrics: Metrics,
-    prequeries_strategy:
-) -> tuple[KnowledgeboxFindResults,
+    prequeries_strategy: PreQueriesStrategy | None = None,
+) -> tuple[KnowledgeboxFindResults, list[PreQueryResult] | None, Fetcher, Reranker]:
     prequeries_results = None
     prefilter_queries_results = None
     queries_results = None
@@ -141,7 +166,7 @@ async def get_find_results(
         prequeries_results = (prefilter_queries_results or []) + (queries_results or [])
 
     with metrics.time("main_query"):
-        main_results,
+        main_results, fetcher, reranker = await run_main_query(
             kbid,
             query,
             item,
@@ -150,10 +175,10 @@
             origin,
             metrics=metrics.child_span("main_query"),
         )
-    return main_results, prequeries_results,
+    return main_results, prequeries_results, fetcher, reranker
 
 
-def add_resource_filter(request:
+def add_resource_filter(request: FindRequest | AskRequest, resources: list[str]):
     if len(resources) == 0:
         return
 
@@ -200,7 +225,6 @@ def find_request_from_ask_request(item: AskRequest, query: str) -> FindRequest:
     find_request.range_modification_end = item.range_modification_end
     find_request.show = item.show
     find_request.extracted = item.extracted
-    find_request.autofilter = item.autofilter
     find_request.highlight = item.highlight
     find_request.security = item.security
     find_request.debug = item.debug
@@ -226,10 +250,10 @@ async def run_main_query(
     user: str,
     origin: str,
     metrics: Metrics,
-) -> tuple[KnowledgeboxFindResults,
+) -> tuple[KnowledgeboxFindResults, Fetcher, Reranker]:
     find_request = find_request_from_ask_request(item, query)
 
-    find_results, incomplete,
+    find_results, incomplete, fetcher, reranker = await find_retrieval(
         kbid,
         find_request,
         ndb_client,
@@ -239,14 +263,14 @@
     )
     if incomplete:
         raise IncompleteFindResultsError()
-    return find_results,
+    return find_results, fetcher, reranker
 
 
 async def get_relations_results(
     *,
     kbid: str,
     text_answer: str,
-    timeout:
+    timeout: float | None = None,
 ) -> Relations:
     try:
         predict = get_predict()
@@ -267,7 +291,7 @@ async def get_relations_results_from_entities(
     *,
     kbid: str,
     entities: Iterable[RelationNode],
-    timeout:
+    timeout: float | None = None,
     deleted_entities: set[str] = set(),
 ) -> Relations:
     entry_points = list(entities)
@@ -308,17 +332,19 @@ def maybe_audit_chat(
     origin: str,
     generative_answer_time: float,
     generative_answer_first_chunk_time: float,
-
+    generative_reasoning_first_chunk_time: float | None,
+    rephrase_time: float | None,
     user_query: str,
-    rephrased_query:
-    retrieval_rephrase_query:
+    rephrased_query: str | None,
+    retrieval_rephrase_query: str | None,
     text_answer: bytes,
+    text_reasoning: str | None,
     status_code: AnswerStatusCode,
     chat_history: list[ChatContextMessage],
     query_context: PromptContext,
     query_context_order: PromptContextOrder,
-    learning_id:
-    model:
+    learning_id: str | None,
+    model: str | None,
 ):
     audit = get_audit()
     if audit is None:
@@ -344,19 +370,21 @@ def maybe_audit_chat(
         question=user_query,
         generative_answer_time=generative_answer_time,
         generative_answer_first_chunk_time=generative_answer_first_chunk_time,
+        generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
         rephrase_time=rephrase_time,
         rephrased_question=rephrased_query,
         retrieval_rephrased_question=retrieval_rephrase_query,
         chat_context=chat_history_context,
         retrieved_context=chat_retrieved_context,
         answer=audit_answer,
+        reasoning=text_reasoning,
         learning_id=learning_id,
         status_code=int(status_code.value),
         model=model,
     )
 
 
-def parse_audit_answer(raw_text_answer: bytes, status_code: AnswerStatusCode) ->
+def parse_audit_answer(raw_text_answer: bytes, status_code: AnswerStatusCode) -> str | None:
     if status_code == AnswerStatusCode.NO_CONTEXT or status_code == AnswerStatusCode.NO_RETRIEVAL_DATA:
         # We don't want to audit "Not enough context to answer this." and instead set a None.
         return None
@@ -377,13 +405,13 @@ class ChatAuditor:
         client_type: NucliaDBClientType,
         origin: str,
         user_query: str,
-        rephrased_query:
-        retrieval_rephrased_query:
+        rephrased_query: str | None,
+        retrieval_rephrased_query: str | None,
         chat_history: list[ChatContextMessage],
-        learning_id:
+        learning_id: str | None,
         query_context: PromptContext,
         query_context_order: PromptContextOrder,
-        model:
+        model: str | None,
     ):
         self.kbid = kbid
         self.user_id = user_id
@@ -401,9 +429,11 @@ class ChatAuditor:
     def audit(
         self,
         text_answer: bytes,
+        text_reasoning: str | None,
         generative_answer_time: float,
         generative_answer_first_chunk_time: float,
-
+        generative_reasoning_first_chunk_time: float | None,
+        rephrase_time: float | None,
         status_code: AnswerStatusCode,
     ):
         maybe_audit_chat(
@@ -416,8 +446,10 @@
             retrieval_rephrase_query=self.retrieval_rephrased_query,
             generative_answer_time=generative_answer_time,
             generative_answer_first_chunk_time=generative_answer_first_chunk_time,
+            generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
             rephrase_time=rephrase_time,
             text_answer=text_answer,
+            text_reasoning=text_reasoning,
             status_code=status_code,
             chat_history=self.chat_history,
             query_context=self.query_context,
@@ -457,7 +489,7 @@ async def run_prequeries(
     async def _prequery_find(prequery: PreQuery, index: int):
         async with max_parallel_prequeries:
             prequery_id = prequery.id or f"prequery-{index}"
-            find_results, _, _ = await
+            find_results, _, _, _ = await find_retrieval(
                 kbid,
                 prequery.request,
                 x_ndb_client,
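Note: the call-site edits in the preceding hunks all follow from one signature change: the retrieval entrypoint used by /ask is now find_retrieval (defined in the next hunk), which returns the Fetcher and Reranker alongside the results so later /ask stages can reuse them. A minimal sketch of the new call contract, using only names that appear in this diff:

find_results, incomplete, fetcher, reranker = await find_retrieval(
    kbid, find_request, x_ndb_client, x_nucliadb_user, x_forwarded_for, metrics=metrics
)
if incomplete:
    raise IncompleteFindResultsError()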
@@ -474,3 +506,392 @@ async def run_prequeries(
     for prequery, find_results in ops_results:
         results.append((prequery, find_results))
     return results
+
+
+async def get_answer_stream(
+    kbid: str,
+    item: ChatModel,
+    extra_headers: dict[str, str] | None = None,
+) -> tuple[str, str, AsyncGenerator[GenerativeChunk, None]]:
+    # NOTE: When moving /ask to RAO, this will need to change to whatever client/utility is used
+    # to call NUA predict (internally or externally in the case of onprem).
+    predict = get_predict()
+    return await predict.chat_query_ndjson(
+        kbid=kbid,
+        item=item,
+        extra_headers=extra_headers,
+    )
+
+
+async def find_retrieval(
+    kbid: str,
+    find_request: FindRequest,
+    x_ndb_client: NucliaDBClientType,
+    x_nucliadb_user: str,
+    x_forwarded_for: str,
+    metrics: Metrics,
+) -> tuple[KnowledgeboxFindResults, bool, Fetcher, Reranker]:
+    if not has_feature(const.Features.ASK_DECOUPLED, context={"kbid": kbid}):
+        results, incomplete, parsed = await find(
+            kbid,
+            find_request,
+            x_ndb_client,
+            x_nucliadb_user,
+            x_forwarded_for,
+            metrics=metrics,
+        )
+        # this has already been asserted inside the find() call
+        assert parsed.retrieval.reranker is not None, "find parser must provide a reranking algorithm"
+        reranker = get_reranker(parsed.retrieval.reranker)
+        return results, incomplete, parsed.fetcher, reranker
+
+    return await rao_find(
+        kbid,
+        find_request,
+        x_ndb_client,
+        x_nucliadb_user,
+        x_forwarded_for,
+        metrics=metrics,
+    )
+
+
+async def rao_find(
+    kbid: str,
+    find_request: FindRequest,
+    x_ndb_client: NucliaDBClientType,
+    x_nucliadb_user: str,
+    x_forwarded_for: str,
+    metrics: Metrics,
+) -> tuple[KnowledgeboxFindResults, bool, Fetcher, Reranker]:
+    """This is an equivalent implementation of /find but uses the new /retrieve
+    and /augment endpoints under the hood while providing bw/c for the /find
+    response model.
+
+    This implementation is provided to comply with the existing /find interface
+    to which /ask is tighly coupled with.
+
+    Note there's an edge case, when users ask for features=relations, in which
+    we fallback to /find, as it's the simplest way to provide bw/c.
+
+    """
+    audit = get_audit()
+    start_time = time()
+
+    fetcher, retrieval_request, reranker = await rao_parse_find(kbid, find_request)
+
+    query = find_request.query
+    rephrased_query = None
+    if retrieval_request.query.keyword:
+        if find_request.query != retrieval_request.query.keyword.query:
+            rephrased_query = retrieval_request.query.keyword.query
+
+    retrieval_response = await rpc.retrieve(
+        kbid,
+        retrieval_request,
+        x_ndb_client=x_ndb_client,
+        x_nucliadb_user=x_nucliadb_user,
+        x_forwarded_for=x_forwarded_for,
+    )
+    matches = retrieval_response.matches
+
+    relations = None
+    if FindOptions.RELATIONS in find_request.features:
+        # the user asked for a legacy relations search, as we don't support it
+        # in the /retrieve endpoint but we must maintain bw/c with /find
+        # responses, we call it with to get just this part of the response
+        find_response, _ = await rpc.find(
+            kbid,
+            FindRequest(
+                features=[FindOptions.RELATIONS],
+                # needed for automatic entity detection
+                query=query,
+                # used for "hardcoded" graph queries
+                query_entities=find_request.query_entities,
+            ),
+            x_ndb_client,
+            x_nucliadb_user,
+            x_forwarded_for,
+            metrics,
+        )
+        relations = find_response.relations
+
+    text_blocks, resources, best_matches = await augment_and_rerank(
+        kbid,
+        matches,
+        # here we use the original top_k, so we end up with the number of
+        # results requested by the user
+        top_k=find_request.top_k,
+        resource_hydration_options=ResourceHydrationOptions(
+            show=find_request.show,
+            extracted=find_request.extracted,
+            field_type_filter=find_request.field_type_filter,
+        ),
+        text_block_hydration_options=TextBlockHydrationOptions(),
+        reranker=reranker,
+        reranking_options=RerankingOptions(kbid=kbid, query=rephrased_query or query),
+    )
+    find_resources = compose_find_resources(text_blocks, resources)
+    find_results = KnowledgeboxFindResults(
+        query=query,
+        rephrased_query=query,
+        resources=find_resources,
+        best_matches=best_matches,
+        relations=relations,
+        # legacy fields
+        total=len(text_blocks),
+        page_number=0,
+        page_size=find_request.top_k,
+        next_page=False,
+    )
+
+    # audit request
+    if audit is not None:
+        from nidx_protos.nodereader_pb2 import SearchRequest
+
+        search_time = time() - start_time
+        # TODO(decoupled-ask): implement audit.retrieve or something like that?
+        audit.search(
+            kbid,
+            x_nucliadb_user,
+            to_proto.client_type(x_ndb_client),
+            x_forwarded_for,
+            # TODO(decoupled-ask): we don't have this proto anymore
+            SearchRequest(),
+            search_time,
+            len(find_resources),
+            retrieval_rephrased_question=rephrased_query,
+        )
+
+    return find_results, False, fetcher, reranker
+
+
+async def augment_and_rerank(
+    kbid: str,
+    matches: list[RetrievalMatch],
+    top_k: int,
+    resource_hydration_options: ResourceHydrationOptions,
+    text_block_hydration_options: TextBlockHydrationOptions,
+    reranker: Reranker,
+    reranking_options: RerankingOptions,
+):
+    score_type_map = {
+        ScoreType.SEMANTIC: SCORE_TYPE.VECTOR,
+        ScoreType.KEYWORD: SCORE_TYPE.BM25,
+        ScoreType.RRF: SCORE_TYPE.BOTH,
+        ScoreType.DEFAULT_RERANKER: SCORE_TYPE.RERANKER,
+        ScoreType.GRAPH: SCORE_TYPE.RELATION_RELEVANCE,
+    }
+    text_blocks = []
+    for match in matches:
+        paragraph_id = ParagraphId.from_string(match.id)
+        score_type = score_type_map[match.score.type]
+        text_block = TextBlockMatch(
+            paragraph_id=paragraph_id,
+            scores=match.score.history,
+            score_type=score_type,
+            position=TextPosition(
+                page_number=match.metadata.page,
+                index=0,
+                start=paragraph_id.paragraph_start,
+                end=paragraph_id.paragraph_end,
+                start_seconds=[],
+                end_seconds=[],
+            ),
+            order=-1,  # will be populated later
+            fuzzy_search=False,  # we don't have this info anymore
+            is_a_table=match.metadata.is_a_table,
+            representation_file=match.metadata.source_file,
+            field_labels=match.metadata.field_labels,
+            paragraph_labels=match.metadata.paragraph_labels,
+        )
+        text_blocks.append(text_block)
+
+    return await hydrate_and_rerank(
+        text_blocks,
+        kbid,
+        resource_hydration_options=resource_hydration_options,
+        text_block_hydration_options=text_block_hydration_options,
+        reranker=reranker,
+        reranking_options=reranking_options,
+        top_k=top_k,
+    )
+
+
+async def hydrate_and_rerank(
+    text_blocks: Iterable[TextBlockMatch],
+    kbid: str,
+    *,
+    resource_hydration_options: ResourceHydrationOptions,
+    text_block_hydration_options: TextBlockHydrationOptions,
+    reranker: Reranker,
+    reranking_options: RerankingOptions,
+    top_k: int,
+) -> tuple[list[TextBlockMatch], list[AugmentedResource], list[str]]:
+    """Given a list of text blocks from a retrieval operation, hydrate and
+    rerank the results.
+
+    This function returns either the entire list or a subset of updated
+    (hydrated and reranked) text blocks and their corresponding resource
+    metadata. It also returns an ordered list of best matches.
+
+    """
+    # Iterate text blocks to create an "index" for faster access by id and get a
+    # list of text block ids and resource ids to hydrate
+    text_blocks_by_id: dict[str, TextBlockMatch] = {}  # useful for faster access to text blocks later
+    resources_to_hydrate = set()
+    text_block_id_to_hydrate = set()
+
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        paragraph_id = text_block.paragraph_id.full()
+
+        # If we find multiple results (from different indexes) with different
+        # metadata, this statement will only get the metadata from the first on
+        # the list. We assume metadata is the same on all indexes, otherwise
+        # this would be a BUG
+        text_blocks_by_id.setdefault(paragraph_id, text_block)
+
+        # rerankers that need extra results may end with less resources than the
+        # ones we see now, so we'll skip this step and recompute the resources
+        # later
+        if not reranker.needs_extra_results:
+            resources_to_hydrate.add(rid)
+
+        if text_block_hydration_options.only_hydrate_empty and text_block.text:
+            pass
+        else:
+            text_block_id_to_hydrate.add(paragraph_id)
+
+    resource_augment = AugmentResources(
+        given=list(resources_to_hydrate),
+        field_type_filter=resource_hydration_options.field_type_filter,
+    )
+    resource_augment.apply_show_and_extracted(
+        resource_hydration_options.show,
+        resource_hydration_options.extracted,
+    )
+
+    # hydrate only the strictly needed before rerank
+    augment_request = AugmentRequest(
+        resources=[resource_augment],
+        paragraphs=[
+            AugmentParagraphs(
+                given=[
+                    AugmentParagraph(
+                        id=paragraph_id,
+                        metadata=ParagraphMetadata(
+                            is_an_image=text_blocks_by_id[paragraph_id].is_an_image,
+                            is_a_table=text_blocks_by_id[paragraph_id].is_a_table,
+                            source_file=text_blocks_by_id[paragraph_id].representation_file,
+                            page=text_blocks_by_id[paragraph_id].position.page_number,
+                            in_page_with_visual=text_blocks_by_id[paragraph_id].page_with_visual,
+                        ),
+                    )
+                    for paragraph_id in text_block_id_to_hydrate
+                ],
+                text=True,
+            )
+        ],
+    )
+    augment_response = await rpc.augment(kbid, augment_request)
+    augmented_paragraphs = augment_response.paragraphs
+    augmented_resources = augment_response.resources
+
+    # add hydrated text to our text blocks
+    for text_block in text_blocks:
+        augmented_paragraph = augmented_paragraphs.get(text_block.paragraph_id.full(), None)
+        if augmented_paragraph is not None and augmented_paragraph.text is not None:
+            if text_block_hydration_options.highlight:
+                text = highlight_paragraph(
+                    augmented_paragraph.text, words=[], ematches=text_block_hydration_options.ematches
+                )
+            else:
+                text = augmented_paragraph.text
+            text_block.text = text
+
+    # with the hydrated text, rerank and apply new scores to the text blocks
+    to_rerank = [
+        RerankableItem(
+            id=text_block.paragraph_id.full(),
+            score=text_block.score,
+            score_type=text_block.score_type,
+            content=text_block.text or "",  # TODO: add a warning, this shouldn't usually happen
+        )
+        for text_block in text_blocks
+    ]
+    reranked = await reranker.rerank(to_rerank, reranking_options)
+
+    # after reranking, we can cut to the number of results the user wants, so we
+    # don't hydrate unnecessary stuff
+    reranked = reranked[:top_k]
+
+    matches = []
+    for item in reranked:
+        paragraph_id = item.id
+        score = item.score
+        score_type = item.score_type
+
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.scores.append(RerankerScore(score=score))
+        text_block.score_type = score_type
+
+        matches.append((paragraph_id, score))
+
+    matches.sort(key=lambda x: x[1], reverse=True)
+
+    best_matches = []
+    best_text_blocks = []
+    resources_to_hydrate.clear()
+    for order, (paragraph_id, _) in enumerate(matches):
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.order = order
+        best_matches.append(paragraph_id)
+        best_text_blocks.append(text_block)
+
+        # now we have removed the text block surplus, fetch resource metadata
+        if reranker.needs_extra_results:
+            rid = ParagraphId.from_string(paragraph_id).rid
+            resources_to_hydrate.add(rid)
+
+    # Finally, fetch resource metadata if we haven't already done it
+    if reranker.needs_extra_results:
+        resource_augment.given = list(resources_to_hydrate)
+        augmented = await rpc.augment(
+            kbid,
+            AugmentRequest(resources=[resource_augment]),
+        )
+        augmented_resources = augmented.resources
+
+    resources = [resource for resource in augmented_resources.values()]
+
+    return best_text_blocks, resources, best_matches
+
+
+def compose_find_resources(
+    text_blocks: list[TextBlockMatch],
+    resources: list[AugmentedResource],
+) -> dict[str, FindResource]:
+    find_resources: dict[str, FindResource] = {}
+
+    for resource in resources:
+        rid = resource.id
+        if rid not in find_resources:
+            find_resources[rid] = FindResource(id=rid, fields={})
+        find_resources[rid].updated_from(resource)
+
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        if rid not in find_resources:
+            # resource not found in db, skipping
+            continue
+
+        find_resource = find_resources[rid]
+        field_id = text_block.paragraph_id.field_id.short_without_subfield()
+        find_field = find_resource.fields.setdefault(field_id, FindField(paragraphs={}))
+
+        paragraph_id = text_block.paragraph_id.full()
+        find_paragraph = text_block_to_find_paragraph(text_block)
+
+        find_field.paragraphs[paragraph_id] = find_paragraph
+
+    return find_resources
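Note: compose_find_resources above rebuilds the nested /find response shape (resource, then field, then paragraph) from the flat list of reranked text blocks. A self-contained sketch of the same grouping pattern using plain dicts; every name below is an illustrative stand-in, not one of the nucliadb models:

from dataclasses import dataclass


@dataclass
class TextBlock:
    rid: str  # resource id
    field_id: str  # e.g. "t/text"
    paragraph_id: str  # e.g. "r1/t/text/0-10"
    text: str


def compose(resource_ids: list[str], text_blocks: list[TextBlock]) -> dict:
    # resource -> field -> paragraph, mirroring the /find response layout
    out: dict[str, dict[str, dict[str, str]]] = {rid: {} for rid in resource_ids}
    for block in text_blocks:
        if block.rid not in out:
            # the paragraph's resource wasn't hydrated (e.g. deleted); skip it
            continue
        fields = out[block.rid].setdefault(block.field_id, {})
        fields[block.paragraph_id] = block.text
    return out


blocks = [TextBlock("r1", "t/text", "r1/t/text/0-10", "hello")]
print(compose(["r1"], blocks))  # {'r1': {'t/text': {'r1/t/text/0-10': 'hello'}}}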