nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
21
|
from time import time
|
|
22
|
-
from typing import Optional, Union
|
|
23
22
|
|
|
24
23
|
from fastapi import Request, Response
|
|
25
24
|
from fastapi_versioning import version
|
|
@@ -75,31 +74,28 @@ async def catalog_get(
|
|
|
75
74
|
response: Response,
|
|
76
75
|
kbid: str,
|
|
77
76
|
query: str = fastapi_query(SearchParamDefaults.query),
|
|
78
|
-
filter_expression:
|
|
77
|
+
filter_expression: str | None = fastapi_query(SearchParamDefaults.catalog_filter_expression),
|
|
79
78
|
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
|
80
79
|
faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
|
|
81
80
|
sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
|
|
82
|
-
sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
|
|
83
81
|
sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
|
|
84
82
|
page_number: int = fastapi_query(SearchParamDefaults.catalog_page_number),
|
|
85
83
|
page_size: int = fastapi_query(SearchParamDefaults.catalog_page_size),
|
|
86
|
-
with_status:
|
|
84
|
+
with_status: ResourceProcessingStatus | None = fastapi_query(
|
|
87
85
|
SearchParamDefaults.with_status, deprecated="Use filters instead"
|
|
88
86
|
),
|
|
89
87
|
debug: bool = fastapi_query(SearchParamDefaults.debug, include_in_schema=False),
|
|
90
|
-
range_creation_start:
|
|
91
|
-
range_creation_end:
|
|
92
|
-
range_modification_start:
|
|
88
|
+
range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
|
|
89
|
+
range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
|
|
90
|
+
range_modification_start: DateTime | None = fastapi_query(
|
|
93
91
|
SearchParamDefaults.range_modification_start
|
|
94
92
|
),
|
|
95
|
-
range_modification_end:
|
|
96
|
-
|
|
97
|
-
),
|
|
98
|
-
hidden: Optional[bool] = fastapi_query(SearchParamDefaults.hidden),
|
|
93
|
+
range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
|
|
94
|
+
hidden: bool | None = fastapi_query(SearchParamDefaults.hidden),
|
|
99
95
|
show: list[ResourceProperties] = fastapi_query(
|
|
100
96
|
SearchParamDefaults.show, default=[ResourceProperties.BASIC, ResourceProperties.ERRORS]
|
|
101
97
|
),
|
|
102
|
-
) ->
|
|
98
|
+
) -> CatalogResponse | HTTPClientError:
|
|
103
99
|
try:
|
|
104
100
|
expr = (
|
|
105
101
|
CatalogFilterExpression.model_validate_json(filter_expression) if filter_expression else None
|
|
@@ -125,7 +121,7 @@ async def catalog_get(
|
|
|
125
121
|
show=show,
|
|
126
122
|
)
|
|
127
123
|
if sort_field:
|
|
128
|
-
item.sort = SortOptions(field=sort_field,
|
|
124
|
+
item.sort = SortOptions(field=sort_field, order=sort_order)
|
|
129
125
|
return await catalog(kbid, item)
|
|
130
126
|
|
|
131
127
|
|
|
@@ -144,14 +140,14 @@ async def catalog_post(
|
|
|
144
140
|
request: Request,
|
|
145
141
|
kbid: str,
|
|
146
142
|
item: CatalogRequest,
|
|
147
|
-
) ->
|
|
143
|
+
) -> CatalogResponse | HTTPClientError:
|
|
148
144
|
return await catalog(kbid, item)
|
|
149
145
|
|
|
150
146
|
|
|
151
147
|
async def catalog(
|
|
152
148
|
kbid: str,
|
|
153
149
|
item: CatalogRequest,
|
|
154
|
-
) ->
|
|
150
|
+
) -> HTTPClientError | CatalogResponse:
|
|
155
151
|
"""
|
|
156
152
|
Catalog endpoint is a simplified version of the search endpoint, it only
|
|
157
153
|
returns bm25 results on titles and it does not support vector search.
|
nucliadb/search/api/v1/find.py
CHANGED
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
|
-
from typing import Optional, Union
|
|
22
21
|
|
|
23
22
|
from fastapi import Body, Header, Query, Request, Response
|
|
24
23
|
from fastapi.openapi.models import Example
|
|
@@ -46,7 +45,6 @@ from nucliadb_models.search import (
|
|
|
46
45
|
KnowledgeboxFindResults,
|
|
47
46
|
NucliaDBClientType,
|
|
48
47
|
RankFusionName,
|
|
49
|
-
Reranker,
|
|
50
48
|
RerankerName,
|
|
51
49
|
ResourceProperties,
|
|
52
50
|
SearchParamDefaults,
|
|
@@ -84,33 +82,31 @@ async def find_knowledgebox(
|
|
|
84
82
|
response: Response,
|
|
85
83
|
kbid: str,
|
|
86
84
|
query: str = fastapi_query(SearchParamDefaults.query),
|
|
87
|
-
filter_expression:
|
|
85
|
+
filter_expression: str | None = fastapi_query(SearchParamDefaults.filter_expression),
|
|
88
86
|
fields: list[str] = fastapi_query(SearchParamDefaults.fields),
|
|
89
87
|
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
|
90
|
-
top_k:
|
|
91
|
-
min_score:
|
|
88
|
+
top_k: int | None = fastapi_query(SearchParamDefaults.top_k),
|
|
89
|
+
min_score: float | None = Query(
|
|
92
90
|
default=None,
|
|
93
|
-
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
91
|
+
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
94
92
|
deprecated=True,
|
|
95
93
|
),
|
|
96
|
-
min_score_semantic:
|
|
94
|
+
min_score_semantic: float | None = Query(
|
|
97
95
|
default=None,
|
|
98
|
-
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
96
|
+
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
|
|
99
97
|
),
|
|
100
98
|
min_score_bm25: float = Query(
|
|
101
99
|
default=0,
|
|
102
100
|
description="Minimum bm25 score to filter paragraph and document index results",
|
|
103
101
|
ge=0,
|
|
104
102
|
),
|
|
105
|
-
vectorset:
|
|
106
|
-
range_creation_start:
|
|
107
|
-
range_creation_end:
|
|
108
|
-
range_modification_start:
|
|
103
|
+
vectorset: str | None = fastapi_query(SearchParamDefaults.vectorset),
|
|
104
|
+
range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
|
|
105
|
+
range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
|
|
106
|
+
range_modification_start: DateTime | None = fastapi_query(
|
|
109
107
|
SearchParamDefaults.range_modification_start
|
|
110
108
|
),
|
|
111
|
-
range_modification_end:
|
|
112
|
-
SearchParamDefaults.range_modification_end
|
|
113
|
-
),
|
|
109
|
+
range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
|
|
114
110
|
features: list[FindOptions] = fastapi_query(
|
|
115
111
|
SearchParamDefaults.search_features,
|
|
116
112
|
default=[
|
|
@@ -127,19 +123,18 @@ async def find_knowledgebox(
|
|
|
127
123
|
extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
|
|
128
124
|
with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
|
|
129
125
|
with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
|
|
130
|
-
autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
|
|
131
126
|
security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
|
|
132
127
|
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
|
133
128
|
rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
|
|
134
|
-
reranker:
|
|
135
|
-
search_configuration:
|
|
129
|
+
reranker: RerankerName = fastapi_query(SearchParamDefaults.reranker),
|
|
130
|
+
search_configuration: str | None = Query(
|
|
136
131
|
default=None,
|
|
137
132
|
description="Load find parameters from this configuration. Parameters in the request override parameters from the configuration.",
|
|
138
133
|
),
|
|
139
134
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
140
135
|
x_nucliadb_user: str = Header(""),
|
|
141
136
|
x_forwarded_for: str = Header(""),
|
|
142
|
-
) ->
|
|
137
|
+
) -> KnowledgeboxFindResults | HTTPClientError:
|
|
143
138
|
try:
|
|
144
139
|
expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
|
|
145
140
|
|
|
@@ -166,7 +161,6 @@ async def find_knowledgebox(
|
|
|
166
161
|
extracted=extracted,
|
|
167
162
|
with_duplicates=with_duplicates,
|
|
168
163
|
with_synonyms=with_synonyms,
|
|
169
|
-
autofilter=autofilter,
|
|
170
164
|
security=security,
|
|
171
165
|
show_hidden=show_hidden,
|
|
172
166
|
rank_fusion=rank_fusion,
|
|
@@ -198,7 +192,7 @@ async def find_post_knowledgebox(
|
|
|
198
192
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
199
193
|
x_nucliadb_user: str = Header(""),
|
|
200
194
|
x_forwarded_for: str = Header(""),
|
|
201
|
-
) ->
|
|
195
|
+
) -> KnowledgeboxFindResults | HTTPClientError:
|
|
202
196
|
return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
|
|
203
197
|
|
|
204
198
|
|
|
@@ -209,7 +203,7 @@ async def _find_endpoint(
|
|
|
209
203
|
x_ndb_client: NucliaDBClientType,
|
|
210
204
|
x_nucliadb_user: str,
|
|
211
205
|
x_forwarded_for: str,
|
|
212
|
-
) ->
|
|
206
|
+
) -> KnowledgeboxFindResults | HTTPClientError:
|
|
213
207
|
if item.search_configuration is not None:
|
|
214
208
|
search_config = await datamanagers.atomic.search_configurations.get(
|
|
215
209
|
kbid=kbid, name=item.search_configuration
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
|
-
from
|
|
21
|
+
from collections.abc import Awaitable
|
|
22
22
|
|
|
23
23
|
from async_lru import alru_cache
|
|
24
24
|
from fastapi import Request, Response
|
|
@@ -81,13 +81,13 @@ class HydratedBuilder:
|
|
|
81
81
|
self._resources: dict[str, HydratedResource] = {}
|
|
82
82
|
self._fields: dict[
|
|
83
83
|
str,
|
|
84
|
-
|
|
85
|
-
HydratedTextField
|
|
86
|
-
HydratedFileField
|
|
87
|
-
HydratedLinkField
|
|
88
|
-
HydratedConversationField
|
|
89
|
-
HydratedGenericField
|
|
90
|
-
|
|
84
|
+
(
|
|
85
|
+
HydratedTextField
|
|
86
|
+
| HydratedFileField
|
|
87
|
+
| HydratedLinkField
|
|
88
|
+
| HydratedConversationField
|
|
89
|
+
| HydratedGenericField
|
|
90
|
+
),
|
|
91
91
|
] = {}
|
|
92
92
|
self._paragraphs: dict[str, HydratedParagraph] = {}
|
|
93
93
|
|
|
@@ -100,13 +100,13 @@ class HydratedBuilder:
|
|
|
100
100
|
self,
|
|
101
101
|
) -> dict[
|
|
102
102
|
str,
|
|
103
|
-
|
|
104
|
-
HydratedTextField
|
|
105
|
-
HydratedFileField
|
|
106
|
-
HydratedLinkField
|
|
107
|
-
HydratedConversationField
|
|
108
|
-
HydratedGenericField
|
|
109
|
-
|
|
103
|
+
(
|
|
104
|
+
HydratedTextField
|
|
105
|
+
| HydratedFileField
|
|
106
|
+
| HydratedLinkField
|
|
107
|
+
| HydratedConversationField
|
|
108
|
+
| HydratedGenericField
|
|
109
|
+
),
|
|
110
110
|
]:
|
|
111
111
|
return self._fields
|
|
112
112
|
|
|
@@ -127,13 +127,13 @@ class HydratedBuilder:
|
|
|
127
127
|
def add_field(
|
|
128
128
|
self,
|
|
129
129
|
field_id: FieldId,
|
|
130
|
-
field:
|
|
131
|
-
HydratedTextField
|
|
132
|
-
HydratedFileField
|
|
133
|
-
HydratedLinkField
|
|
134
|
-
HydratedConversationField
|
|
135
|
-
HydratedGenericField
|
|
136
|
-
|
|
130
|
+
field: (
|
|
131
|
+
HydratedTextField
|
|
132
|
+
| HydratedFileField
|
|
133
|
+
| HydratedLinkField
|
|
134
|
+
| HydratedConversationField
|
|
135
|
+
| HydratedGenericField
|
|
136
|
+
),
|
|
137
137
|
):
|
|
138
138
|
self._fields[field_id.full()] = field
|
|
139
139
|
|
|
@@ -233,7 +233,7 @@ class Hydrator:
|
|
|
233
233
|
|
|
234
234
|
if field_id not in field_tasks:
|
|
235
235
|
field_tasks[field_id] = asyncio.create_task(
|
|
236
|
-
self._limited_concurrency(hydrate_field(
|
|
236
|
+
self._limited_concurrency(hydrate_field(field, field_id, self.config.field))
|
|
237
237
|
)
|
|
238
238
|
|
|
239
239
|
if rid not in resource_tasks:
|
|
@@ -323,6 +323,6 @@ class Hydrator:
|
|
|
323
323
|
async with self.max_ops:
|
|
324
324
|
return await aw
|
|
325
325
|
|
|
326
|
-
@alru_cache(maxsize=
|
|
327
|
-
async def cached_download_page_preview(self, field: Field, page: int) ->
|
|
326
|
+
@alru_cache(maxsize=50)
|
|
327
|
+
async def cached_download_page_preview(self, field: Field, page: int) -> Image | None:
|
|
328
328
|
return await download_page_preview(field, page)
|
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
|
-
from typing import Optional
|
|
22
21
|
|
|
23
22
|
from fastapi import HTTPException, Request
|
|
24
23
|
from fastapi_versioning import version
|
|
@@ -178,7 +177,7 @@ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
|
|
|
178
177
|
)
|
|
179
178
|
|
|
180
179
|
try:
|
|
181
|
-
results:
|
|
180
|
+
results: list[Shard] | None = await asyncio.wait_for(
|
|
182
181
|
asyncio.gather(*ops, return_exceptions=True), # type: ignore
|
|
183
182
|
timeout=settings.search_timeout,
|
|
184
183
|
)
|
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
|
-
from typing import Union
|
|
22
21
|
|
|
23
22
|
from fastapi import Header, Request
|
|
24
23
|
from fastapi.responses import Response, StreamingResponse
|
|
@@ -68,7 +67,7 @@ async def predict_proxy_endpoint(
|
|
|
68
67
|
x_nucliadb_user: str = Header(""),
|
|
69
68
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
70
69
|
x_forwarded_for: str = Header(""),
|
|
71
|
-
) ->
|
|
70
|
+
) -> Response | StreamingResponse | HTTPClientError:
|
|
72
71
|
try:
|
|
73
72
|
payload = await request.json()
|
|
74
73
|
except json.JSONDecodeError:
|
|
@@ -17,14 +17,14 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from
|
|
20
|
+
from uuid import UUID
|
|
21
21
|
|
|
22
22
|
from fastapi import Header, Request, Response
|
|
23
23
|
from fastapi_versioning import version
|
|
24
24
|
from starlette.responses import StreamingResponse
|
|
25
25
|
|
|
26
|
+
from nucliadb.common import datamanagers
|
|
26
27
|
from nucliadb.models.responses import HTTPClientError
|
|
27
|
-
from nucliadb.search.api.v1.resource.utils import get_resource_uuid_by_slug
|
|
28
28
|
from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_SLUG_PREFIX, api
|
|
29
29
|
from nucliadb_models.resource import NucliaDBRoles
|
|
30
30
|
from nucliadb_models.search import AskRequest, NucliaDBClientType, SyncAskResponse
|
|
@@ -47,7 +47,7 @@ from ..ask import create_ask_response
|
|
|
47
47
|
async def resource_ask_endpoint_by_uuid(
|
|
48
48
|
request: Request,
|
|
49
49
|
kbid: str,
|
|
50
|
-
rid:
|
|
50
|
+
rid: UUID,
|
|
51
51
|
item: AskRequest,
|
|
52
52
|
x_show_consumption: bool = Header(default=False),
|
|
53
53
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
@@ -58,7 +58,7 @@ async def resource_ask_endpoint_by_uuid(
|
|
|
58
58
|
description="When set to true, outputs response as JSON in a non-streaming way. "
|
|
59
59
|
"This is slower and requires waiting for entire answer to be ready.",
|
|
60
60
|
),
|
|
61
|
-
) ->
|
|
61
|
+
) -> StreamingResponse | HTTPClientError | Response:
|
|
62
62
|
current_user: NucliaUser = request.user
|
|
63
63
|
# If present, security groups from AuthorizationBackend overrides any
|
|
64
64
|
# security group of the payload
|
|
@@ -75,7 +75,7 @@ async def resource_ask_endpoint_by_uuid(
|
|
|
75
75
|
client_type=x_ndb_client,
|
|
76
76
|
origin=x_forwarded_for,
|
|
77
77
|
x_synchronous=x_synchronous,
|
|
78
|
-
resource=rid,
|
|
78
|
+
resource=str(rid),
|
|
79
79
|
extra_predict_headers={"X-Show-Consumption": str(x_show_consumption).lower()},
|
|
80
80
|
)
|
|
81
81
|
|
|
@@ -104,8 +104,8 @@ async def resource_ask_endpoint_by_slug(
|
|
|
104
104
|
description="When set to true, outputs response as JSON in a non-streaming way. "
|
|
105
105
|
"This is slower and requires waiting for entire answer to be ready.",
|
|
106
106
|
),
|
|
107
|
-
) ->
|
|
108
|
-
resource_id = await
|
|
107
|
+
) -> StreamingResponse | HTTPClientError | Response:
|
|
108
|
+
resource_id = await datamanagers.atomic.resources.get_resource_uuid_from_slug(kbid=kbid, slug=slug)
|
|
109
109
|
if resource_id is None:
|
|
110
110
|
return HTTPClientError(status_code=404, detail="Resource not found")
|
|
111
111
|
|
|
@@ -17,14 +17,13 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Union
|
|
21
20
|
|
|
22
21
|
from fastapi import Header, Request, Response
|
|
23
22
|
from fastapi_versioning import version
|
|
24
23
|
|
|
24
|
+
from nucliadb.common import datamanagers
|
|
25
25
|
from nucliadb.common.models_utils import from_proto
|
|
26
26
|
from nucliadb.models.responses import HTTPClientError
|
|
27
|
-
from nucliadb.search.api.v1.resource.utils import get_resource_uuid_by_slug
|
|
28
27
|
from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RESOURCE_SLUG_PREFIX, api
|
|
29
28
|
from nucliadb.search.predict_models import AugmentedField, RunAgentsResponse
|
|
30
29
|
from nucliadb.search.search.exceptions import ResourceNotFoundError
|
|
@@ -58,7 +57,7 @@ async def run_agents_by_uuid(
|
|
|
58
57
|
rid: str,
|
|
59
58
|
item: ResourceAgentsRequest,
|
|
60
59
|
x_nucliadb_user: str = Header(""),
|
|
61
|
-
) ->
|
|
60
|
+
) -> ResourceAgentsResponse | HTTPClientError:
|
|
62
61
|
return await _run_agents_endpoint(kbid, rid, x_nucliadb_user, item)
|
|
63
62
|
|
|
64
63
|
|
|
@@ -80,8 +79,8 @@ async def run_agents_by_slug(
|
|
|
80
79
|
slug: str,
|
|
81
80
|
item: ResourceAgentsRequest,
|
|
82
81
|
x_nucliadb_user: str = Header(""),
|
|
83
|
-
) ->
|
|
84
|
-
resource_id = await
|
|
82
|
+
) -> ResourceAgentsResponse | HTTPClientError:
|
|
83
|
+
resource_id = await datamanagers.atomic.resources.get_resource_uuid_from_slug(kbid=kbid, slug=slug)
|
|
85
84
|
if resource_id is None:
|
|
86
85
|
return HTTPClientError(status_code=404, detail="Resource not found")
|
|
87
86
|
return await _run_agents_endpoint(kbid, resource_id, x_nucliadb_user, item)
|
|
@@ -89,7 +88,7 @@ async def run_agents_by_slug(
|
|
|
89
88
|
|
|
90
89
|
async def _run_agents_endpoint(
|
|
91
90
|
kbid: str, resource_id: str, user_id: str, item: ResourceAgentsRequest
|
|
92
|
-
) ->
|
|
91
|
+
) -> ResourceAgentsResponse | HTTPClientError:
|
|
93
92
|
try:
|
|
94
93
|
run_agents_response: RunAgentsResponse = await run_agents(
|
|
95
94
|
kbid,
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
|
-
from typing import
|
|
21
|
+
from typing import cast
|
|
22
22
|
|
|
23
23
|
from fastapi import Header, Request, Response
|
|
24
24
|
from fastapi_versioning import version
|
|
@@ -62,25 +62,23 @@ async def resource_search(
|
|
|
62
62
|
kbid: str,
|
|
63
63
|
query: str,
|
|
64
64
|
rid: str,
|
|
65
|
-
filter_expression:
|
|
65
|
+
filter_expression: str | None = fastapi_query(SearchParamDefaults.filter_expression),
|
|
66
66
|
fields: list[str] = fastapi_query(SearchParamDefaults.fields),
|
|
67
67
|
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
|
68
68
|
faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
|
|
69
|
-
sort:
|
|
69
|
+
sort: SortField | None = fastapi_query(SearchParamDefaults.sort_field, alias="sort_field"),
|
|
70
70
|
sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
|
|
71
|
-
top_k:
|
|
72
|
-
range_creation_start:
|
|
73
|
-
range_creation_end:
|
|
74
|
-
range_modification_start:
|
|
71
|
+
top_k: int | None = fastapi_query(SearchParamDefaults.top_k),
|
|
72
|
+
range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
|
|
73
|
+
range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
|
|
74
|
+
range_modification_start: DateTime | None = fastapi_query(
|
|
75
75
|
SearchParamDefaults.range_modification_start
|
|
76
76
|
),
|
|
77
|
-
range_modification_end:
|
|
78
|
-
SearchParamDefaults.range_modification_end
|
|
79
|
-
),
|
|
77
|
+
range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
|
|
80
78
|
highlight: bool = fastapi_query(SearchParamDefaults.highlight),
|
|
81
79
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
82
80
|
debug: bool = fastapi_query(SearchParamDefaults.debug),
|
|
83
|
-
) ->
|
|
81
|
+
) -> ResourceSearchResults | HTTPClientError:
|
|
84
82
|
top_k = top_k or SearchParamDefaults.top_k # type: ignore
|
|
85
83
|
top_k = cast(int, top_k)
|
|
86
84
|
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
from time import time
|
|
22
|
+
|
|
23
|
+
from fastapi import Header, HTTPException, Request
|
|
24
|
+
from fastapi_versioning import version
|
|
25
|
+
|
|
26
|
+
from nucliadb.common.exceptions import InvalidQueryError
|
|
27
|
+
from nucliadb.common.external_index_providers.base import TextBlockMatch
|
|
28
|
+
from nucliadb.common.models_utils import to_proto
|
|
29
|
+
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
|
30
|
+
from nucliadb.search.search.query_parser.parsers.retrieve import parse_retrieve
|
|
31
|
+
from nucliadb.search.search.retrieval import text_block_search
|
|
32
|
+
from nucliadb_models.resource import NucliaDBRoles
|
|
33
|
+
from nucliadb_models.retrieval import (
|
|
34
|
+
Metadata,
|
|
35
|
+
RetrievalMatch,
|
|
36
|
+
RetrievalRequest,
|
|
37
|
+
RetrievalResponse,
|
|
38
|
+
Scores,
|
|
39
|
+
)
|
|
40
|
+
from nucliadb_models.search import NucliaDBClientType
|
|
41
|
+
from nucliadb_utils.authentication import requires
|
|
42
|
+
from nucliadb_utils.utilities import get_audit
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@api.post(
    f"/{KB_PREFIX}/{{kbid}}/retrieve",
    status_code=200,
    description="Search text blocks on a Knowledge Box",
    include_in_schema=False,
    tags=["Search"],
)
@requires(NucliaDBRoles.READER)
@version(1)
async def _retrieve_endpoint(
    request: Request,
    kbid: str,
    item: RetrievalRequest,
    x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
    x_nucliadb_user: str = Header(""),
    x_forwarded_for: str = Header(""),
) -> RetrievalResponse:
    """HTTP handler for the Knowledge Box `retrieve` route.

    Thin adapter: it takes the path parameter, the request body and the
    caller headers, and delegates all the work to `retrieve_endpoint`.

    Args:
        request: incoming request; not read in this body.
            NOTE(review): presumably consumed by the `requires` decorator -- confirm.
        kbid: Knowledge Box id taken from the URL path.
        item: retrieval request payload.
        x_ndb_client: client type header, defaults to `NucliaDBClientType.API`.
        x_nucliadb_user: user header, defaults to an empty string.
        x_forwarded_for: forwarded-for header, defaults to an empty string.

    Returns:
        Whatever `retrieve_endpoint` returns for this request.
    """
    # The header values are forwarded as keyword-only arguments.
    caller_headers = {
        "x_ndb_client": x_ndb_client,
        "x_nucliadb_user": x_nucliadb_user,
        "x_forwarded_for": x_forwarded_for,
    }
    response = await retrieve_endpoint(kbid, item, **caller_headers)
    return response
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
async def retrieve_endpoint(
    kbid: str,
    item: RetrievalRequest,
    *,
    x_ndb_client: NucliaDBClientType,
    x_nucliadb_user: str,
    x_forwarded_for: str,
) -> RetrievalResponse:
    """Search text blocks on a Knowledge Box and return them as retrieval matches.

    The request is parsed with `parse_retrieve`, executed with
    `text_block_search`, truncated to the parsed `top_k` and converted to
    response models. When an audit backend is available, the call is audited
    together with the elapsed time, measured from before parsing until after
    the matches are built.

    Args:
        kbid: Knowledge Box id to search on.
        item: retrieval request payload.
        x_ndb_client: client type, converted to its proto form for auditing.
        x_nucliadb_user: user identifier passed to the audit call.
        x_forwarded_for: origin passed to the audit call.

    Returns:
        A `RetrievalResponse` holding at most `top_k` matches.

    Raises:
        HTTPException: with status 422 when parsing the request raises
            `InvalidQueryError`; the original error is kept as `__cause__`.
    """
    audit = get_audit()
    start_time = time()

    try:
        retrieval = await parse_retrieve(kbid, item)
    except InvalidQueryError as err:
        # Chain explicitly so the traceback shows the parsing error as the
        # direct cause instead of an unrelated error raised while handling it.
        raise HTTPException(
            status_code=422,
            detail=str(err),
        ) from err

    # Only the text blocks are needed here; the other three results are unused.
    text_blocks, _, _, _ = await text_block_search(kbid, retrieval)

    # cut the top K, we may have more due to extra results used for rank fusion
    text_blocks = text_blocks[: retrieval.top_k]

    # convert to response models
    matches = [text_block_match_to_retrieval_match(text_block) for text_block in text_blocks]

    if audit is not None:
        retrieval_time = time() - start_time
        audit.retrieve(
            kbid,
            x_nucliadb_user,
            to_proto.client_type(x_ndb_client),
            x_forwarded_for,
            retrieval_time,
            # TODO(decoupled-ask): add interesting things to audit
        )

    return RetrievalResponse(matches=matches)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def text_block_match_to_retrieval_match(item: TextBlockMatch) -> RetrievalMatch:
    """Convert an internal text block match into its API response model.

    Args:
        item: text block match to convert.

    Returns:
        A `RetrievalMatch` whose id is the full paragraph id, whose score is
        built from the match's current score plus its score history, and whose
        metadata copies labels, image/table flags, representation file, page
        number and the page-with-visual flag from the match.
    """
    match_id = item.paragraph_id.full()

    current = item.current_score
    score = Scores(
        value=current.score,
        source=current.source,
        type=current.type,
        history=item.scores,
    )

    metadata = Metadata(
        field_labels=item.field_labels,
        paragraph_labels=item.paragraph_labels,
        is_an_image=item.is_an_image,
        is_a_table=item.is_a_table,
        source_file=item.representation_file,
        page=item.position.page_number,
        in_page_with_visual=item.page_with_visual,
    )

    return RetrievalMatch(id=match_id, score=score, metadata=metadata)
|