nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/api/v1/search.py
CHANGED
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
21
|
from time import time
|
|
22
|
-
from typing import Optional, Union
|
|
23
22
|
|
|
24
23
|
from fastapi import Body, Header, Query, Request, Response
|
|
25
24
|
from fastapi.openapi.models import Example
|
|
@@ -37,7 +36,10 @@ from nucliadb.search.requesters.utils import Method, nidx_query
|
|
|
37
36
|
from nucliadb.search.search import cache
|
|
38
37
|
from nucliadb.search.search.merge import merge_results
|
|
39
38
|
from nucliadb.search.search.query_parser.parsers.search import parse_search
|
|
40
|
-
from nucliadb.search.search.query_parser.parsers.unit_retrieval import
|
|
39
|
+
from nucliadb.search.search.query_parser.parsers.unit_retrieval import (
|
|
40
|
+
convert_retrieval_to_proto,
|
|
41
|
+
is_incomplete,
|
|
42
|
+
)
|
|
41
43
|
from nucliadb.search.search.utils import (
|
|
42
44
|
min_score_from_query_params,
|
|
43
45
|
)
|
|
@@ -65,7 +67,7 @@ from nucliadb_utils.utilities import get_audit
|
|
|
65
67
|
SEARCH_EXAMPLES = {
|
|
66
68
|
"filtering_by_icon": Example(
|
|
67
69
|
summary="Search for pdf documents where the text 'Noam Chomsky' appears",
|
|
68
|
-
description="For a complete list of filters, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
|
|
70
|
+
description="For a complete list of filters, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
|
|
69
71
|
value={
|
|
70
72
|
"query": "Noam Chomsky",
|
|
71
73
|
"filters": ["/icon/application/pdf"],
|
|
@@ -74,7 +76,7 @@ SEARCH_EXAMPLES = {
|
|
|
74
76
|
),
|
|
75
77
|
"get_language_counts": Example(
|
|
76
78
|
summary="Get the number of documents for each language",
|
|
77
|
-
description="For a complete list of facets, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
|
|
79
|
+
description="For a complete list of facets, visit: https://github.com/nuclia/nucliadb/blob/main/docs/internal/SEARCH.md#filters-and-facets",
|
|
78
80
|
value={
|
|
79
81
|
"page_size": 0,
|
|
80
82
|
"faceted": ["/s/p"],
|
|
@@ -88,7 +90,7 @@ SEARCH_EXAMPLES = {
|
|
|
88
90
|
f"/{KB_PREFIX}/{{kbid}}/search",
|
|
89
91
|
status_code=200,
|
|
90
92
|
summary="Search Knowledge Box",
|
|
91
|
-
description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
|
|
93
|
+
description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
|
|
92
94
|
response_model=KnowledgeboxSearchResults,
|
|
93
95
|
response_model_exclude_unset=True,
|
|
94
96
|
tags=["Search"],
|
|
@@ -100,37 +102,35 @@ async def search_knowledgebox(
|
|
|
100
102
|
response: Response,
|
|
101
103
|
kbid: str,
|
|
102
104
|
query: str = fastapi_query(SearchParamDefaults.query),
|
|
103
|
-
filter_expression:
|
|
105
|
+
filter_expression: str | None = fastapi_query(SearchParamDefaults.filter_expression),
|
|
104
106
|
fields: list[str] = fastapi_query(SearchParamDefaults.fields),
|
|
105
107
|
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
|
106
108
|
faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
|
|
107
109
|
sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
|
|
108
|
-
sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
|
|
109
110
|
sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
|
|
110
111
|
top_k: int = fastapi_query(SearchParamDefaults.top_k),
|
|
111
|
-
|
|
112
|
+
offset: int = fastapi_query(SearchParamDefaults.offset),
|
|
113
|
+
min_score: float | None = Query(
|
|
112
114
|
default=None,
|
|
113
|
-
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
115
|
+
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
114
116
|
deprecated=True,
|
|
115
117
|
),
|
|
116
|
-
min_score_semantic:
|
|
118
|
+
min_score_semantic: float | None = Query(
|
|
117
119
|
default=None,
|
|
118
|
-
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
120
|
+
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
119
121
|
),
|
|
120
122
|
min_score_bm25: float = Query(
|
|
121
123
|
default=0,
|
|
122
124
|
description="Minimum bm25 score to filter paragraph and document index results",
|
|
123
125
|
ge=0,
|
|
124
126
|
),
|
|
125
|
-
vectorset:
|
|
126
|
-
range_creation_start:
|
|
127
|
-
range_creation_end:
|
|
128
|
-
range_modification_start:
|
|
127
|
+
vectorset: str | None = fastapi_query(SearchParamDefaults.vectorset),
|
|
128
|
+
range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
|
|
129
|
+
range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
|
|
130
|
+
range_modification_start: DateTime | None = fastapi_query(
|
|
129
131
|
SearchParamDefaults.range_modification_start
|
|
130
132
|
),
|
|
131
|
-
range_modification_end:
|
|
132
|
-
SearchParamDefaults.range_modification_end
|
|
133
|
-
),
|
|
133
|
+
range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
|
|
134
134
|
features: list[SearchOptions] = fastapi_query(
|
|
135
135
|
SearchParamDefaults.search_features,
|
|
136
136
|
default=[
|
|
@@ -148,13 +148,12 @@ async def search_knowledgebox(
|
|
|
148
148
|
extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
|
|
149
149
|
with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
|
|
150
150
|
with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
|
|
151
|
-
autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
|
|
152
151
|
security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
|
|
153
152
|
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
|
154
153
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
155
154
|
x_nucliadb_user: str = Header(""),
|
|
156
155
|
x_forwarded_for: str = Header(""),
|
|
157
|
-
) ->
|
|
156
|
+
) -> KnowledgeboxSearchResults | HTTPClientError:
|
|
158
157
|
try:
|
|
159
158
|
expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
|
|
160
159
|
|
|
@@ -167,11 +166,7 @@ async def search_knowledgebox(
|
|
|
167
166
|
fields=fields,
|
|
168
167
|
filters=filters,
|
|
169
168
|
faceted=faceted,
|
|
170
|
-
sort=(
|
|
171
|
-
SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
|
|
172
|
-
if sort_field is not None
|
|
173
|
-
else None
|
|
174
|
-
),
|
|
169
|
+
sort=(SortOptions(field=sort_field, order=sort_order) if sort_field is not None else None),
|
|
175
170
|
top_k=top_k,
|
|
176
171
|
min_score=min_score_from_query_params(min_score_bm25, min_score_semantic, min_score),
|
|
177
172
|
vectorset=vectorset,
|
|
@@ -187,9 +182,9 @@ async def search_knowledgebox(
|
|
|
187
182
|
extracted=extracted,
|
|
188
183
|
with_duplicates=with_duplicates,
|
|
189
184
|
with_synonyms=with_synonyms,
|
|
190
|
-
autofilter=autofilter,
|
|
191
185
|
security=security,
|
|
192
186
|
show_hidden=show_hidden,
|
|
187
|
+
offset=offset,
|
|
193
188
|
)
|
|
194
189
|
except ValidationError as exc:
|
|
195
190
|
detail = json.loads(exc.json())
|
|
@@ -201,7 +196,7 @@ async def search_knowledgebox(
|
|
|
201
196
|
f"/{KB_PREFIX}/{{kbid}}/search",
|
|
202
197
|
status_code=200,
|
|
203
198
|
summary="Search Knowledge Box",
|
|
204
|
-
description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
|
|
199
|
+
description="Search on a Knowledge Box and retrieve separate results for documents, paragraphs, and sentences. Usually, it is better to use `find`",
|
|
205
200
|
response_model=KnowledgeboxSearchResults,
|
|
206
201
|
response_model_exclude_unset=True,
|
|
207
202
|
tags=["Search"],
|
|
@@ -216,7 +211,7 @@ async def search_post_knowledgebox(
|
|
|
216
211
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
217
212
|
x_nucliadb_user: str = Header(""),
|
|
218
213
|
x_forwarded_for: str = Header(""),
|
|
219
|
-
) ->
|
|
214
|
+
) -> KnowledgeboxSearchResults | HTTPClientError:
|
|
220
215
|
return await _search_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
|
|
221
216
|
|
|
222
217
|
|
|
@@ -228,7 +223,7 @@ async def _search_endpoint(
|
|
|
228
223
|
x_nucliadb_user: str,
|
|
229
224
|
x_forwarded_for: str,
|
|
230
225
|
**kwargs,
|
|
231
|
-
) ->
|
|
226
|
+
) -> KnowledgeboxSearchResults | HTTPClientError:
|
|
232
227
|
try:
|
|
233
228
|
with cache.request_caches():
|
|
234
229
|
results, incomplete = await search(
|
|
@@ -256,13 +251,14 @@ async def search(
|
|
|
256
251
|
x_nucliadb_user: str,
|
|
257
252
|
x_forwarded_for: str,
|
|
258
253
|
do_audit: bool = True,
|
|
259
|
-
with_status:
|
|
254
|
+
with_status: ResourceProcessingStatus | None = None,
|
|
260
255
|
) -> tuple[KnowledgeboxSearchResults, bool]:
|
|
261
256
|
audit = get_audit()
|
|
262
257
|
start_time = time()
|
|
263
258
|
|
|
264
259
|
parsed = await parse_search(kbid, item)
|
|
265
|
-
|
|
260
|
+
incomplete_results = is_incomplete(parsed.retrieval)
|
|
261
|
+
pb_query = convert_retrieval_to_proto(parsed.retrieval)
|
|
266
262
|
|
|
267
263
|
# We need to query all nodes
|
|
268
264
|
results, queried_shards = await nidx_query(kbid, Method.SEARCH, pb_query)
|
|
@@ -276,6 +272,7 @@ async def search(
|
|
|
276
272
|
field_type_filter=item.field_type_filter,
|
|
277
273
|
extracted=item.extracted,
|
|
278
274
|
highlight=item.highlight,
|
|
275
|
+
offset=item.offset,
|
|
279
276
|
)
|
|
280
277
|
|
|
281
278
|
if audit is not None and do_audit:
|
|
@@ -290,5 +287,4 @@ async def search(
|
|
|
290
287
|
)
|
|
291
288
|
|
|
292
289
|
search_results.shards = queried_shards
|
|
293
|
-
search_results.autofilters = autofilters
|
|
294
290
|
return search_results, incomplete_results
|
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
21
|
from datetime import datetime
|
|
22
|
-
from typing import Optional, Union
|
|
23
22
|
|
|
24
23
|
from fastapi import Header, Request, Response
|
|
25
24
|
from fastapi_versioning import version
|
|
@@ -64,20 +63,18 @@ async def suggest_knowledgebox(
|
|
|
64
63
|
response: Response,
|
|
65
64
|
kbid: str,
|
|
66
65
|
query: str = fastapi_query(SearchParamDefaults.suggest_query),
|
|
67
|
-
filter_expression:
|
|
66
|
+
filter_expression: str | None = fastapi_query(
|
|
68
67
|
SearchParamDefaults.filter_expression, include_in_schema=False
|
|
69
68
|
),
|
|
70
69
|
fields: list[str] = fastapi_query(SearchParamDefaults.fields),
|
|
71
70
|
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
|
72
71
|
faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
|
|
73
|
-
range_creation_start:
|
|
74
|
-
range_creation_end:
|
|
75
|
-
range_modification_start:
|
|
72
|
+
range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
|
|
73
|
+
range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
|
|
74
|
+
range_modification_start: DateTime | None = fastapi_query(
|
|
76
75
|
SearchParamDefaults.range_modification_start
|
|
77
76
|
),
|
|
78
|
-
range_modification_end:
|
|
79
|
-
SearchParamDefaults.range_modification_end
|
|
80
|
-
),
|
|
77
|
+
range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
|
|
81
78
|
features: list[SuggestOptions] = fastapi_query(SearchParamDefaults.suggest_features),
|
|
82
79
|
show: list[ResourceProperties] = fastapi_query(SearchParamDefaults.show),
|
|
83
80
|
field_type_filter: list[FieldTypeName] = fastapi_query(
|
|
@@ -89,7 +86,7 @@ async def suggest_knowledgebox(
|
|
|
89
86
|
debug: bool = fastapi_query(SearchParamDefaults.debug),
|
|
90
87
|
highlight: bool = fastapi_query(SearchParamDefaults.highlight),
|
|
91
88
|
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
|
92
|
-
) ->
|
|
89
|
+
) -> KnowledgeboxSuggestResults | HTTPClientError:
|
|
93
90
|
try:
|
|
94
91
|
expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
|
|
95
92
|
|
|
@@ -126,14 +123,14 @@ async def suggest(
|
|
|
126
123
|
response,
|
|
127
124
|
kbid: str,
|
|
128
125
|
query: str,
|
|
129
|
-
filter_expression:
|
|
126
|
+
filter_expression: FilterExpression | None,
|
|
130
127
|
fields: list[str],
|
|
131
128
|
filters: list[str],
|
|
132
129
|
faceted: list[str],
|
|
133
|
-
range_creation_start:
|
|
134
|
-
range_creation_end:
|
|
135
|
-
range_modification_start:
|
|
136
|
-
range_modification_end:
|
|
130
|
+
range_creation_start: datetime | None,
|
|
131
|
+
range_creation_end: datetime | None,
|
|
132
|
+
range_modification_start: datetime | None,
|
|
133
|
+
range_modification_end: datetime | None,
|
|
137
134
|
features: list[SuggestOptions],
|
|
138
135
|
show: list[ResourceProperties],
|
|
139
136
|
field_type_filter: list[FieldTypeName],
|
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Union
|
|
21
20
|
|
|
22
21
|
from fastapi import Header, Request
|
|
23
22
|
from fastapi_versioning import version
|
|
@@ -48,7 +47,7 @@ async def summarize_endpoint(
|
|
|
48
47
|
kbid: str,
|
|
49
48
|
item: SummarizeRequest,
|
|
50
49
|
x_show_consumption: bool = Header(default=False),
|
|
51
|
-
) ->
|
|
50
|
+
) -> SummarizedResponse | HTTPClientError:
|
|
52
51
|
try:
|
|
53
52
|
return await summarize(
|
|
54
53
|
kbid=kbid,
|
nucliadb/search/api/v1/utils.py
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Any
|
|
20
|
+
from typing import Any
|
|
21
21
|
|
|
22
22
|
from fastapi import Query
|
|
23
23
|
|
|
@@ -26,7 +26,7 @@ from nucliadb_models.search import ParamDefault
|
|
|
26
26
|
_NOT_SET = object()
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def fastapi_query(param: ParamDefault, default:
|
|
29
|
+
def fastapi_query(param: ParamDefault, default: Any | None = _NOT_SET, **kw) -> Query: # type: ignore
|
|
30
30
|
# Be able to override default value
|
|
31
31
|
if default is _NOT_SET:
|
|
32
32
|
default_value = param.default
|
nucliadb/search/app.py
CHANGED
|
@@ -26,7 +26,7 @@ from starlette.middleware.authentication import AuthenticationMiddleware
|
|
|
26
26
|
from starlette.requests import ClientDisconnect, Request
|
|
27
27
|
from starlette.responses import HTMLResponse
|
|
28
28
|
|
|
29
|
-
from nucliadb.middleware import ProcessTimeHeaderMiddleware
|
|
29
|
+
from nucliadb.middleware import ClientErrorPayloadLoggerMiddleware, ProcessTimeHeaderMiddleware
|
|
30
30
|
from nucliadb.search import API_PREFIX
|
|
31
31
|
from nucliadb.search.api.v1.router import api as api_v1
|
|
32
32
|
from nucliadb.search.lifecycle import lifespan
|
|
@@ -47,6 +47,7 @@ middleware.extend(
|
|
|
47
47
|
[
|
|
48
48
|
Middleware(AuthenticationMiddleware, backend=NucliaCloudAuthenticationBackend()),
|
|
49
49
|
Middleware(AuditMiddleware, audit_utility_getter=get_audit),
|
|
50
|
+
Middleware(ClientErrorPayloadLoggerMiddleware),
|
|
50
51
|
]
|
|
51
52
|
)
|
|
52
53
|
|
|
@@ -58,7 +59,6 @@ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
|
|
58
59
|
|
|
59
60
|
fastapi_settings = dict(
|
|
60
61
|
debug=running_settings.debug,
|
|
61
|
-
middleware=middleware,
|
|
62
62
|
lifespan=lifespan,
|
|
63
63
|
exception_handlers={
|
|
64
64
|
Exception: global_exception_handler,
|
|
@@ -78,6 +78,7 @@ application = VersionedFastAPI(
|
|
|
78
78
|
prefix_format=f"/{API_PREFIX}/v{{major}}",
|
|
79
79
|
default_version=(1, 0),
|
|
80
80
|
enable_latest=False,
|
|
81
|
+
middleware=middleware,
|
|
81
82
|
kwargs=fastapi_settings,
|
|
82
83
|
)
|
|
83
84
|
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from . import fields, paragraphs, resources # noqa: F401
|
|
21
|
+
from .augmentor import augment # noqa: F401
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
import asyncio
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from typing_extensions import assert_never
|
|
24
|
+
|
|
25
|
+
import nucliadb_models
|
|
26
|
+
from nucliadb.common import datamanagers
|
|
27
|
+
from nucliadb.common.ids import FIELD_TYPE_NAME_TO_STR, FieldId, ParagraphId
|
|
28
|
+
from nucliadb.models.internal.augment import (
|
|
29
|
+
Augment,
|
|
30
|
+
Augmented,
|
|
31
|
+
AugmentedField,
|
|
32
|
+
AugmentedParagraph,
|
|
33
|
+
AugmentedResource,
|
|
34
|
+
)
|
|
35
|
+
from nucliadb.search.augmentor.utils import limited_concurrency
|
|
36
|
+
from nucliadb.search.search.hydrator import ResourceHydrationOptions
|
|
37
|
+
from nucliadb_models.common import FieldTypeName
|
|
38
|
+
from nucliadb_models.resource import Resource
|
|
39
|
+
|
|
40
|
+
from .fields import augment_field
|
|
41
|
+
from .paragraphs import augment_paragraph
|
|
42
|
+
from .resources import augment_resource, augment_resource_deep
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _field_matches_filter(
    field_id: FieldId,
    field_filter: "nucliadb_models.filters.Field | nucliadb_models.filters.Generated",
) -> bool:
    """Tell whether `field_id` is selected by a single field filter."""
    if isinstance(field_filter, nucliadb_models.filters.Field):
        # `field_id.type` is compared against FIELD_TYPE_NAME_TO_STR values in
        # the Generated branch below, so translate the filter type through the
        # same mapping before comparing. Falling back to the raw value keeps
        # filters that already carry the abbreviated type working.
        wanted_type = FIELD_TYPE_NAME_TO_STR.get(field_filter.type, field_filter.type)
        return wanted_type == field_id.type and (
            field_filter.name is None or field_filter.name == field_id.key
        )

    elif isinstance(field_filter, nucliadb_models.filters.Generated):
        # generated fields are always text fields starting with "da-"
        if field_id.type != FIELD_TYPE_NAME_TO_STR[FieldTypeName.TEXT]:
            return False
        if field_filter.da_task is None:
            # no specific task requested: any generated field matches, but a
            # regular (non "da-") text field must not
            return field_id.key.startswith("da-")
        return field_id.key.startswith(f"da-{field_filter.da_task}-")

    else:  # pragma: no cover
        assert_never(field_filter)


async def augment(
    kbid: str,
    augmentations: list[Augment],
    *,
    concurrency_control: asyncio.Semaphore | None = None,
) -> Augmented:
    """Process multiple augmentations concurrently and return the augmented content.

    This is a heavy operation that can lead to many I/O operations with maindb
    and/or blob storage. For improved performance, make sure this is called
    inside the context of `nucliadb.search.search.cache` `request_caches`.

    The work is done in two phases. First, every augmentation is folded into a
    per-target plan (resource id, field id or paragraph id), merging the
    selections requested by different augmentations for the same target.
    Then, one task per target is scheduled and all of them are awaited
    together.

    Args:
        kbid: Knowledge box the augmented content belongs to.
        augmentations: Augmentations to apply. Each one is dispatched on its
            `from_` attribute ("resources", "resources.deep", "fields",
            "files", "conversations" or "paragraphs").
        concurrency_control: Optional semaphore forwarded to
            `limited_concurrency` as `max_ops` for every scheduled task.

    Returns:
        An `Augmented` object with the results indexed by their id. Targets
        whose augmentation returned `None` are left out.
    """
    augments: dict[str, Any] = {
        "resources": {},
        "resources.deep": {},
        "fields": {},
        "paragraphs": {},
    }

    # Phase 1: merge all the augmentations into a plan per target
    for augmentation in augmentations:
        if augmentation.from_ == "resources":
            for given_id in augmentation.given:
                if isinstance(given_id, str):
                    rid = given_id
                elif isinstance(given_id, (FieldId, ParagraphId)):
                    rid = given_id.rid
                else:  # pragma: no cover
                    assert_never(given_id)

                augments["resources"].setdefault(rid, []).extend(augmentation.select)

        elif augmentation.from_ == "resources.deep":
            for rid in augmentation.given:
                opts = augments["resources.deep"].setdefault(rid, ResourceHydrationOptions())
                opts.show.extend(augmentation.show)
                opts.extracted.extend(augmentation.extracted)
                opts.field_type_filter.extend(augmentation.field_type_filter)

        elif augmentation.from_ == "fields":
            unfiltered_field_ids: list[FieldId] = []
            for given_id in augmentation.given:
                if isinstance(given_id, str):
                    # augmenting resource fields
                    rid = given_id
                    all_field_ids = await datamanagers.atomic.resources.get_all_field_ids(
                        kbid=kbid, rid=rid, for_update=False
                    )
                    if all_field_ids is None:
                        continue

                    unfiltered_field_ids.extend(
                        FieldId.from_pb(
                            rid=rid, field_type=field_id_pb.field_type, key=field_id_pb.field
                        )
                        for field_id_pb in all_field_ids.fields
                    )

                elif isinstance(given_id, FieldId):
                    unfiltered_field_ids.append(given_id)

                elif isinstance(given_id, ParagraphId):
                    unfiltered_field_ids.append(given_id.field_id)

                else:  # pragma: no cover
                    assert_never(given_id)

            if not augmentation.filter:
                field_ids = unfiltered_field_ids
            else:
                # A field is kept once as soon as any filter accepts it, so a
                # field matching several filters doesn't get its selection
                # repeated.
                field_ids = [
                    field_id
                    for field_id in unfiltered_field_ids
                    if any(
                        _field_matches_filter(field_id, field_filter)
                        for field_filter in augmentation.filter
                    )
                ]

            for field_id in field_ids:
                augments["fields"].setdefault(field_id, []).extend(augmentation.select)

        elif augmentation.from_ == "files" or augmentation.from_ == "conversations":
            for given_id in augmentation.given:
                if isinstance(given_id, FieldId):
                    field_id = given_id
                elif isinstance(given_id, ParagraphId):
                    field_id = given_id.field_id
                else:  # pragma: no cover
                    assert_never(given_id)

                augments["fields"].setdefault(field_id, []).extend(augmentation.select)

        elif augmentation.from_ == "paragraphs":
            for paragraph in augmentation.given:
                select, metadata = augments["paragraphs"].setdefault(paragraph.id, ([], None))
                select.extend(augmentation.select)
                # we keep the first metadata object we see
                metadata = metadata or paragraph.metadata
                augments["paragraphs"][paragraph.id] = (select, metadata)

        else:  # pragma: no cover
            assert_never(augmentation.from_)

    # Phase 2: schedule one task per target
    ops = {  # type: ignore[var-annotated]
        "resources": [],
        "resources.deep": [],
        "fields": [],
        "paragraphs": [],
    }
    for rid, select in augments["resources"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_resource(  # type: ignore[arg-type]
                    kbid, rid, select
                ),
                max_ops=concurrency_control,
            )
        )
        ops["resources"].append(task)

    for rid, opts in augments["resources.deep"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_resource_deep(  # type: ignore[arg-type]
                    kbid, rid, opts
                ),
                max_ops=concurrency_control,
            )
        )
        ops["resources.deep"].append(task)

    for field_id, select in augments["fields"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_field(  # type: ignore[arg-type]
                    kbid, field_id, select
                ),
                max_ops=concurrency_control,
            )
        )
        ops["fields"].append(task)

    for paragraph_id, (select, metadata) in augments["paragraphs"].items():
        task = asyncio.create_task(
            limited_concurrency(
                augment_paragraph(  # type: ignore[arg-type]
                    kbid, paragraph_id, select, metadata
                ),
                max_ops=concurrency_control,
            )
        )
        ops["paragraphs"].append(task)

    results = await asyncio.gather(
        *ops["resources"], *ops["resources.deep"], *ops["fields"], *ops["paragraphs"]
    )

    # gather() returns the results in the order the awaitables were passed, so
    # the flat list can be cut back into the four groups using the number of
    # tasks scheduled for each of them.
    resources_end = len(ops["resources"])
    resources_deep_end = resources_end + len(ops["resources.deep"])
    fields_end = resources_deep_end + len(ops["fields"])
    paragraphs_end = fields_end + len(ops["paragraphs"])

    resources: list[AugmentedResource] = results[:resources_end]
    resources_deep: list[Resource] = results[resources_end:resources_deep_end]
    fields: list[AugmentedField] = results[resources_deep_end:fields_end]
    paragraphs: list[AugmentedParagraph] = results[fields_end:paragraphs_end]

    return Augmented(
        resources={resource.id: resource for resource in resources if resource is not None},
        resources_deep={
            resource_deep.id: resource_deep
            for resource_deep in resources_deep
            if resource_deep is not None
        },
        fields={field.id: field for field in fields if field is not None},
        paragraphs={paragraph.id: paragraph for paragraph in paragraphs if paragraph is not None},
    )
|