nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic. Click here for more details.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
#
|
|
20
20
|
from typing import Dict
|
|
21
21
|
|
|
22
|
-
from fastapi import Request
|
|
22
|
+
from fastapi import Header, Request
|
|
23
23
|
from fastapi_versioning import version
|
|
24
24
|
from nuclia_models.config.proto import ExtractConfig, SplitConfiguration
|
|
25
25
|
|
|
@@ -60,15 +60,11 @@ async def download_model(
|
|
|
60
60
|
)
|
|
61
61
|
@requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
|
|
62
62
|
@version(1)
|
|
63
|
-
async def get_configuration(
|
|
64
|
-
request: Request,
|
|
65
|
-
kbid: str,
|
|
66
|
-
):
|
|
63
|
+
async def get_configuration(request: Request, kbid: str):
|
|
67
64
|
return await learning_config_proxy(
|
|
68
65
|
request,
|
|
69
66
|
"GET",
|
|
70
67
|
f"/config/{kbid}",
|
|
71
|
-
extra_headers={"X-STF-USER": request.headers.get("X-NUCLIADB-USER", "")},
|
|
72
68
|
)
|
|
73
69
|
|
|
74
70
|
|
|
@@ -108,7 +104,6 @@ async def get_model(
|
|
|
108
104
|
request,
|
|
109
105
|
"GET",
|
|
110
106
|
f"/models/{kbid}/model/{model_id}",
|
|
111
|
-
extra_headers={"X-STF-USER": request.headers.get("X-NUCLIADB-USER", "")},
|
|
112
107
|
)
|
|
113
108
|
|
|
114
109
|
|
|
@@ -123,10 +118,35 @@ async def get_model(
|
|
|
123
118
|
@requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
|
|
124
119
|
@version(1)
|
|
125
120
|
async def get_schema_for_configuration_updates(
|
|
126
|
-
request: Request,
|
|
127
|
-
kbid: str,
|
|
121
|
+
request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
|
|
128
122
|
):
|
|
129
|
-
return await learning_config_proxy(
|
|
123
|
+
return await learning_config_proxy(
|
|
124
|
+
request,
|
|
125
|
+
"GET",
|
|
126
|
+
f"/schema/{kbid}",
|
|
127
|
+
headers={"account-id": x_nucliadb_account},
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@api.get(
    path=f"/{KB_PREFIX}/{{kbid}}/generative_providers",
    status_code=200,
    summary="Available models for a knowledge box",
    description="Get all available models for a knowledge box grouped by provider",
    response_model=None,
    tags=["Models"],
)
@requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
@version(1)
async def get_models_group_by_providers(
    request: Request,
    kbid: str,
    x_nucliadb_account: str = Header(default="", include_in_schema=False),
):
    """Proxy a GET for the generative providers of a knowledge box.

    The request is forwarded through ``learning_config_proxy`` to
    ``/generative_providers/{kbid}``. The value of the ``x_nucliadb_account``
    header parameter (empty string when absent) is sent along as the
    ``account-id`` header.
    """
    proxied_path = f"/generative_providers/{kbid}"
    account_headers = {"account-id": x_nucliadb_account}
    return await learning_config_proxy(
        request,
        "GET",
        proxied_path,
        headers=account_headers,
    )
|
|
130
150
|
|
|
131
151
|
|
|
132
152
|
@api.get(
|
|
@@ -77,7 +77,7 @@ async def list_resources(
|
|
|
77
77
|
|
|
78
78
|
# Get counters from maindb
|
|
79
79
|
driver = get_driver()
|
|
80
|
-
async with driver.
|
|
80
|
+
async with driver.ro_transaction() as txn:
|
|
81
81
|
# Filter parameters for serializer
|
|
82
82
|
show: list[ResourceProperties] = [ResourceProperties.BASIC]
|
|
83
83
|
field_types: list[FieldTypeName] = []
|
|
@@ -335,7 +335,7 @@ async def _get_resource_field(
|
|
|
335
335
|
storage = await get_storage(service_name=SERVICE_NAME)
|
|
336
336
|
driver = get_driver()
|
|
337
337
|
pb_field_id = to_proto.field_type_name(field_type)
|
|
338
|
-
async with driver.
|
|
338
|
+
async with driver.ro_transaction() as txn:
|
|
339
339
|
kb = ORMKnowledgeBox(txn, storage, kbid)
|
|
340
340
|
|
|
341
341
|
if rid is None:
|
|
@@ -287,7 +287,7 @@ async def processing_status(
|
|
|
287
287
|
storage = await get_storage(service_name=SERVICE_NAME)
|
|
288
288
|
driver = get_driver()
|
|
289
289
|
|
|
290
|
-
async with driver.
|
|
290
|
+
async with driver.ro_transaction() as txn:
|
|
291
291
|
kb = KnowledgeBox(txn, storage, kbid)
|
|
292
292
|
|
|
293
293
|
max_simultaneous = asyncio.Semaphore(10)
|
|
@@ -201,7 +201,7 @@ async def get_resource_title_cached(
|
|
|
201
201
|
|
|
202
202
|
|
|
203
203
|
async def get_resource_title(kv_driver: Driver, kbid: str, resource_uuid: str) -> Optional[str]:
|
|
204
|
-
async with kv_driver.
|
|
204
|
+
async with kv_driver.ro_transaction() as txn:
|
|
205
205
|
basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=resource_uuid)
|
|
206
206
|
if basic is None:
|
|
207
207
|
return None
|
|
@@ -25,6 +25,7 @@ from fastapi import Request, Response
|
|
|
25
25
|
from fastapi_versioning import version
|
|
26
26
|
from pydantic import ValidationError
|
|
27
27
|
|
|
28
|
+
from nucliadb.common.catalog import catalog_facets, catalog_search
|
|
28
29
|
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
|
29
30
|
from nucliadb.common.exceptions import InvalidQueryError
|
|
30
31
|
from nucliadb.models.responses import HTTPClientError
|
|
@@ -33,7 +34,6 @@ from nucliadb.search.api.v1.router import KB_PREFIX, api
|
|
|
33
34
|
from nucliadb.search.api.v1.utils import fastapi_query
|
|
34
35
|
from nucliadb.search.search import cache
|
|
35
36
|
from nucliadb.search.search.merge import fetch_resources
|
|
36
|
-
from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
|
|
37
37
|
from nucliadb.search.search.query_parser.parsers import parse_catalog
|
|
38
38
|
from nucliadb.search.search.utils import (
|
|
39
39
|
maybe_log_request_payload,
|
|
@@ -164,7 +164,7 @@ async def catalog(
|
|
|
164
164
|
query_parser = await parse_catalog(kbid, item)
|
|
165
165
|
|
|
166
166
|
catalog_results = CatalogResponse()
|
|
167
|
-
catalog_results.fulltext = await
|
|
167
|
+
catalog_results.fulltext = await catalog_search(query_parser)
|
|
168
168
|
catalog_results.resources = await fetch_resources(
|
|
169
169
|
resources=[r.rid for r in catalog_results.fulltext.results],
|
|
170
170
|
kbid=kbid,
|
|
@@ -205,7 +205,7 @@ async def catalog(
|
|
|
205
205
|
)
|
|
206
206
|
@requires(NucliaDBRoles.READER)
|
|
207
207
|
@version(1)
|
|
208
|
-
async def
|
|
208
|
+
async def catalog_facets_endpoint(
|
|
209
209
|
request: Request, kbid: str, item: CatalogFacetsRequest
|
|
210
210
|
) -> CatalogFacetsResponse:
|
|
211
|
-
return CatalogFacetsResponse(facets=await
|
|
211
|
+
return CatalogFacetsResponse(facets=await catalog_facets(kbid, item))
|
nucliadb/search/api/v1/find.py
CHANGED
|
@@ -46,7 +46,6 @@ from nucliadb_models.search import (
|
|
|
46
46
|
KnowledgeboxFindResults,
|
|
47
47
|
NucliaDBClientType,
|
|
48
48
|
RankFusionName,
|
|
49
|
-
Reranker,
|
|
50
49
|
RerankerName,
|
|
51
50
|
ResourceProperties,
|
|
52
51
|
SearchParamDefaults,
|
|
@@ -127,11 +126,10 @@ async def find_knowledgebox(
|
|
|
127
126
|
extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
|
|
128
127
|
with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
|
|
129
128
|
with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
|
|
130
|
-
autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
|
|
131
129
|
security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
|
|
132
130
|
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
|
133
131
|
rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
|
|
134
|
-
reranker:
|
|
132
|
+
reranker: RerankerName = fastapi_query(SearchParamDefaults.reranker),
|
|
135
133
|
search_configuration: Optional[str] = Query(
|
|
136
134
|
default=None,
|
|
137
135
|
description="Load find parameters from this configuration. Parameters in the request override parameters from the configuration.",
|
|
@@ -166,7 +164,6 @@ async def find_knowledgebox(
|
|
|
166
164
|
extracted=extracted,
|
|
167
165
|
with_duplicates=with_duplicates,
|
|
168
166
|
with_synonyms=with_synonyms,
|
|
169
|
-
autofilter=autofilter,
|
|
170
167
|
security=security,
|
|
171
168
|
show_hidden=show_hidden,
|
|
172
169
|
rank_fusion=rank_fusion,
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
import asyncio
|
|
21
|
+
from typing import Awaitable, Optional, Union
|
|
22
|
+
|
|
23
|
+
from async_lru import alru_cache
|
|
24
|
+
from fastapi import Request, Response
|
|
25
|
+
from fastapi_versioning import version
|
|
26
|
+
|
|
27
|
+
from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
|
|
28
|
+
from nucliadb.ingest.fields.base import Field
|
|
29
|
+
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
|
30
|
+
from nucliadb.search.search import cache
|
|
31
|
+
from nucliadb.search.search.cache import request_caches
|
|
32
|
+
from nucliadb.search.search.hydrator.fields import hydrate_field, page_preview_id
|
|
33
|
+
from nucliadb.search.search.hydrator.images import (
|
|
34
|
+
download_page_preview,
|
|
35
|
+
)
|
|
36
|
+
from nucliadb.search.search.hydrator.paragraphs import ParagraphIndex, hydrate_paragraph
|
|
37
|
+
from nucliadb.search.search.hydrator.resources import hydrate_resource
|
|
38
|
+
from nucliadb_models.hydration import (
|
|
39
|
+
Hydrated,
|
|
40
|
+
HydratedConversationField,
|
|
41
|
+
HydratedFileField,
|
|
42
|
+
HydratedGenericField,
|
|
43
|
+
HydratedLinkField,
|
|
44
|
+
HydratedParagraph,
|
|
45
|
+
HydratedResource,
|
|
46
|
+
HydratedTextField,
|
|
47
|
+
HydrateRequest,
|
|
48
|
+
Hydration,
|
|
49
|
+
ParagraphHydration,
|
|
50
|
+
)
|
|
51
|
+
from nucliadb_models.resource import NucliaDBRoles
|
|
52
|
+
from nucliadb_models.search import Image
|
|
53
|
+
from nucliadb_utils.authentication import requires
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@api.post(
    f"/{KB_PREFIX}/{{kbid}}/hydrate",
    status_code=200,
    summary="Hydrate a set of paragraphs",
    description="Internal API endpoint to hydrate a set of paragraphs",
    include_in_schema=False,
    response_model_exclude_unset=True,
    tags=["Hydration"],
)
@requires(NucliaDBRoles.READER)
@version(1)
async def hydrate_endpoint(
    request: Request, response: Response, kbid: str, item: HydrateRequest
) -> Hydrated:
    """Hydrate the paragraphs listed in ``item.data`` for knowledge box ``kbid``.

    A fresh ``Hydrator`` is built from ``item.hydration`` for every call and
    the whole hydration runs inside the ``request_caches()`` context.
    ``request`` and ``response`` are accepted but not used by the body.
    """
    with request_caches():
        hydrator = Hydrator(kbid, item.hydration)
        hydrated = await hydrator.hydrate(item.data)
    return hydrated
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class HydratedBuilder:
    """Builder class to construct a Hydrated payload.

    Resources are stored by resource id, fields by ``FieldId.full()`` and
    paragraphs by ``ParagraphId.full()``. ``build()`` wraps the three
    mappings in a ``Hydrated`` object.
    """

    def __init__(self) -> None:
        self._resources: dict[str, HydratedResource] = {}
        self._fields: dict[
            str,
            Union[
                HydratedTextField,
                HydratedFileField,
                HydratedLinkField,
                HydratedConversationField,
                HydratedGenericField,
            ],
        ] = {}
        self._paragraphs: dict[str, HydratedParagraph] = {}

    @property
    def resources(self) -> dict[str, HydratedResource]:
        """Hydrated resources added so far, keyed by resource id."""
        return self._resources

    @property
    def fields(
        self,
    ) -> dict[
        str,
        Union[
            HydratedTextField,
            HydratedFileField,
            HydratedLinkField,
            HydratedConversationField,
            HydratedGenericField,
        ],
    ]:
        """Hydrated fields added so far, keyed by full field id."""
        return self._fields

    @property
    def paragraphs(self) -> dict[str, HydratedParagraph]:
        """Hydrated paragraphs added so far, keyed by full paragraph id."""
        return self._paragraphs

    def build(self) -> Hydrated:
        """Return a ``Hydrated`` payload with everything added so far."""
        return Hydrated(
            resources=self._resources,
            fields=self._fields,
            paragraphs=self._paragraphs,
        )

    def add_resource(self, rid: str, resource: HydratedResource):
        """Store (or replace) the hydrated resource for ``rid``."""
        self._resources[rid] = resource

    def add_field(
        self,
        field_id: FieldId,
        field: Union[
            HydratedTextField,
            HydratedFileField,
            HydratedLinkField,
            HydratedConversationField,
            HydratedGenericField,
        ],
    ):
        """Store (or replace) the hydrated field for ``field_id``."""
        self._fields[field_id.full()] = field

    def has_field(self, field_id: FieldId) -> bool:
        """Tell whether a hydrated field was already added for ``field_id``."""
        return field_id.full() in self._fields

    def add_paragraph(self, paragraph_id: ParagraphId, paragraph: HydratedParagraph):
        """Store (or replace) the hydrated paragraph for ``paragraph_id``."""
        self._paragraphs[paragraph_id.full()] = paragraph

    def add_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
        """Attach a page preview to the paragraph's file field and reference
        it from the paragraph's ``page`` section.

        Nothing happens when the paragraph's field has not been added or is
        not a file field. The paragraph itself must already have been added
        with its ``page`` section set.
        """
        preview_id = self._store_page_preview(paragraph_id, page, image)
        if preview_id is None:
            return

        paragraph = self._paragraphs[paragraph_id.full()]
        assert paragraph.page is not None, "should already be set"
        paragraph.page.page_preview_ref = preview_id

    def add_table_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
        """Attach a page preview to the paragraph's file field and reference
        it from the paragraph's ``table`` section.

        Nothing happens when the paragraph's field has not been added or is
        not a file field. The paragraph itself must already have been added
        with its ``table`` section set.
        """
        preview_id = self._store_page_preview(paragraph_id, page, image)
        if preview_id is None:
            return

        paragraph = self._paragraphs[paragraph_id.full()]
        assert paragraph.table is not None, "should already be set"
        paragraph.table.page_preview_ref = preview_id

    def _store_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
        """Save ``image`` in the previews of the file field owning the paragraph.

        Returns the preview id under which the image was stored, or ``None``
        when there is no file field to store it in.
        """
        field = self._fields.get(paragraph_id.field_id.full())
        if field is None:
            # The field was not hydrated (e.g. hydration disabled for it), so
            # there is nowhere to hang the preview: skip instead of raising.
            return None

        if not isinstance(field, HydratedFileField):
            # Other field types have no page preview concept
            return None

        if field.previews is None:
            field.previews = {}

        preview_id = page_preview_id(page)
        field.previews[preview_id] = image
        return preview_id
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class Hydrator:
    """Hydrate paragraphs together with the fields and resources owning them.

    Results are accumulated in a ``HydratedBuilder``. The instance also keeps
    a paragraph index per field and the page previews it has already
    downloaded, so repeated lookups during one ``hydrate`` call are served
    from memory.
    """

    def __init__(self, kbid: str, config: Hydration):
        self.kbid = kbid
        self.config = config
        self.hydrated = HydratedBuilder()

        # cached paragraphs per field
        self.field_paragraphs: dict[FieldId, ParagraphIndex] = {}

        # Page previews already downloaded, keyed by (field, page). Kept on
        # the instance so the cache is released together with the hydrator
        # instead of growing forever in a class-level cache.
        self._page_previews: dict[tuple[Field, int], Optional[Image]] = {}

        # upper bound on hydration operations running at the same time
        self.max_ops = asyncio.Semaphore(50)

    async def hydrate(self, paragraph_ids: list[str]) -> Hydrated:
        """Hydrate ``paragraph_ids`` and return the resulting payload.

        Ids are deduplicated and parsed with ``ParagraphId.from_string``.
        Ids with an invalid format, or pointing to a resource or field that
        does not exist, are silently skipped. For every remaining paragraph
        the paragraph, its field and (when ``config.resource`` is set) its
        resource are hydrated concurrently, bounded by ``max_ops``. Related
        paragraphs reported by the paragraph hydration are then hydrated with
        text only, and page/table previews are attached for paragraphs whose
        field was hydrated.
        """
        paragraph_tasks = {}
        field_tasks = {}
        resource_tasks = {}
        # Field object owning each requested paragraph; needed again after
        # the gather to download previews from the right field.
        paragraph_fields = {}

        for user_paragraph_id in set(paragraph_ids):
            try:
                paragraph_id = ParagraphId.from_string(user_paragraph_id)
            except ValueError:
                # skip paragraphs with invalid format
                continue

            found = await self._get_resource_and_field(paragraph_id)
            if found is None:
                # skip resources and fields that aren't in the DB
                continue
            resource, field = found

            field_id = paragraph_id.field_id
            rid = paragraph_id.rid
            paragraph_fields[paragraph_id] = field

            paragraph_tasks[paragraph_id] = asyncio.create_task(
                self._limited_concurrency(
                    hydrate_paragraph(
                        resource,
                        field,
                        paragraph_id,
                        self.config.paragraph,
                        self._paragraph_index(field_id),
                    ),
                )
            )

            if field_id not in field_tasks:
                field_tasks[field_id] = asyncio.create_task(
                    self._limited_concurrency(hydrate_field(resource, field_id, self.config.field))
                )

            if rid not in resource_tasks and self.config.resource is not None:
                resource_tasks[rid] = asyncio.create_task(
                    self._limited_concurrency(hydrate_resource(resource, rid, self.config.resource))
                )

        # gather() keeps the order of its arguments, so the results can be
        # split back into the three groups by position.
        results = await asyncio.gather(
            *paragraph_tasks.values(),
            *field_tasks.values(),
            *resource_tasks.values(),
        )
        n_paragraphs = len(paragraph_tasks)
        n_fields = len(field_tasks)
        hydrated_paragraphs = results[:n_paragraphs]
        hydrated_fields = results[n_paragraphs : n_paragraphs + n_fields]
        hydrated_resources = results[n_paragraphs + n_fields :]

        for rid, hydrated_resource in zip(resource_tasks, hydrated_resources):
            self.hydrated.add_resource(rid, hydrated_resource)

        for field_id, hydrated_field in zip(field_tasks, hydrated_fields):
            if hydrated_field is not None:
                self.hydrated.add_field(field_id, hydrated_field)

        for paragraph_id, (hydrated_paragraph, extra) in zip(paragraph_tasks, hydrated_paragraphs):
            self.hydrated.add_paragraph(paragraph_id, hydrated_paragraph)

            for related_paragraph_id in extra.related_paragraph_ids:
                if related_paragraph_id in paragraph_tasks:
                    # Explicitly requested paragraphs are hydrated with the
                    # full configuration; don't replace them with the
                    # text-only version used for related paragraphs.
                    continue
                await self._hydrate_related_paragraph(related_paragraph_id)

            # Use the field of *this* paragraph. Names left over from earlier
            # loops may point to a different (e.g. related) paragraph's field.
            if not self.hydrated.has_field(paragraph_id.field_id):
                # we only hydrate page and table previews for fields the user
                # allowed hydration, skipping fields with explicitly disabled
                # hydration
                continue
            field = paragraph_fields[paragraph_id]

            if extra.field_page is not None:
                page_number = extra.field_page
                preview = await self.cached_download_page_preview(field, page_number)
                if preview is not None:
                    self.hydrated.add_page_preview(paragraph_id, page_number, preview)

            if extra.field_table_page is not None:
                page_number = extra.field_table_page
                preview = await self.cached_download_page_preview(field, page_number)
                if preview is not None:
                    self.hydrated.add_table_page_preview(paragraph_id, page_number, preview)

        return self.hydrated.build()

    async def _get_resource_and_field(self, paragraph_id: ParagraphId):
        """Return ``(resource, field)`` for the paragraph, or ``None`` when
        the resource or the field is not in the DB."""
        field_id = paragraph_id.field_id

        resource = await cache.get_resource(self.kbid, paragraph_id.rid)
        if resource is None:
            return None

        field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
        if not (await resource.field_exists(field_type_pb, field_id.key)):
            return None

        field = await resource.get_field(field_id.key, field_id.pb_type)
        return resource, field

    def _paragraph_index(self, field_id: FieldId) -> ParagraphIndex:
        """Return the paragraph index of ``field_id``, creating it on first use."""
        if field_id not in self.field_paragraphs:
            self.field_paragraphs[field_id] = ParagraphIndex(field_id)
        return self.field_paragraphs[field_id]

    async def _hydrate_related_paragraph(self, related_paragraph_id: ParagraphId) -> None:
        """Hydrate a related paragraph with text only and add it to the payload.

        Paragraphs whose resource or field is not in the DB are skipped.
        """
        found = await self._get_resource_and_field(related_paragraph_id)
        if found is None:
            return
        resource, field = found

        (hydrated_paragraph, _) = await hydrate_paragraph(
            resource,
            field,
            related_paragraph_id,
            ParagraphHydration(
                text=self.config.paragraph.text, image=None, table=None, page=None, related=None
            ),
            self._paragraph_index(related_paragraph_id.field_id),
        )
        self.hydrated.add_paragraph(related_paragraph_id, hydrated_paragraph)

    # TODO: proper typing
    async def _limited_concurrency(self, aw: Awaitable):
        """Await ``aw`` while holding a slot of the ``max_ops`` semaphore."""
        async with self.max_ops:
            return await aw

    async def cached_download_page_preview(self, field: Field, page: int) -> Optional[Image]:
        """Download the preview of ``page`` for ``field`` at most once per instance.

        Successful results (including ``None``) are remembered for the
        lifetime of this hydrator; failed downloads are not cached.
        """
        key = (field, page)
        if key not in self._page_previews:
            self._page_previews[key] = await download_page_preview(field, page)
        return self._page_previews[key]
|
|
@@ -28,7 +28,8 @@ from nucliadb.search.api.v1.resource.utils import get_resource_uuid_by_slug
|
|
|
28
28
|
from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_SLUG_PREFIX, api
|
|
29
29
|
from nucliadb_models.resource import NucliaDBRoles
|
|
30
30
|
from nucliadb_models.search import AskRequest, NucliaDBClientType, SyncAskResponse
|
|
31
|
-
from
|
|
31
|
+
from nucliadb_models.security import RequestSecurity
|
|
32
|
+
from nucliadb_utils.authentication import NucliaUser, requires
|
|
32
33
|
|
|
33
34
|
from ..ask import create_ask_response
|
|
34
35
|
|
|
@@ -58,6 +59,15 @@ async def resource_ask_endpoint_by_uuid(
|
|
|
58
59
|
"This is slower and requires waiting for entire answer to be ready.",
|
|
59
60
|
),
|
|
60
61
|
) -> Union[StreamingResponse, HTTPClientError, Response]:
|
|
62
|
+
current_user: NucliaUser = request.user
|
|
63
|
+
# If present, security groups from AuthorizationBackend override any
|
|
64
|
+
# security group of the payload
|
|
65
|
+
if current_user.security_groups:
|
|
66
|
+
if item.security is None:
|
|
67
|
+
item.security = RequestSecurity(groups=current_user.security_groups)
|
|
68
|
+
else:
|
|
69
|
+
item.security.groups = current_user.security_groups
|
|
70
|
+
|
|
61
71
|
return await create_ask_response(
|
|
62
72
|
kbid=kbid,
|
|
63
73
|
ask_request=item,
|
|
@@ -98,6 +108,16 @@ async def resource_ask_endpoint_by_slug(
|
|
|
98
108
|
resource_id = await get_resource_uuid_by_slug(kbid, slug)
|
|
99
109
|
if resource_id is None:
|
|
100
110
|
return HTTPClientError(status_code=404, detail="Resource not found")
|
|
111
|
+
|
|
112
|
+
current_user: NucliaUser = request.user
|
|
113
|
+
# If present, security groups from AuthorizationBackend override any
|
|
114
|
+
# security group of the payload
|
|
115
|
+
if current_user.security_groups:
|
|
116
|
+
if item.security is None:
|
|
117
|
+
item.security = RequestSecurity(groups=current_user.security_groups)
|
|
118
|
+
else:
|
|
119
|
+
item.security.groups = current_user.security_groups
|
|
120
|
+
|
|
101
121
|
return await create_ask_response(
|
|
102
122
|
kbid=kbid,
|
|
103
123
|
ask_request=item,
|
nucliadb/search/api/v1/search.py
CHANGED
|
@@ -148,7 +148,6 @@ async def search_knowledgebox(
|
|
|
148
148
|
extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
|
|
149
149
|
with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
|
|
150
150
|
with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
|
|
151
|
-
autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
|
|
152
151
|
security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
|
|
153
152
|
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
|
154
153
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
|
@@ -187,7 +186,6 @@ async def search_knowledgebox(
|
|
|
187
186
|
extracted=extracted,
|
|
188
187
|
with_duplicates=with_duplicates,
|
|
189
188
|
with_synonyms=with_synonyms,
|
|
190
|
-
autofilter=autofilter,
|
|
191
189
|
security=security,
|
|
192
190
|
show_hidden=show_hidden,
|
|
193
191
|
)
|
|
@@ -262,7 +260,7 @@ async def search(
|
|
|
262
260
|
start_time = time()
|
|
263
261
|
|
|
264
262
|
parsed = await parse_search(kbid, item)
|
|
265
|
-
pb_query, incomplete_results,
|
|
263
|
+
pb_query, incomplete_results, _ = await legacy_convert_retrieval_to_proto(parsed)
|
|
266
264
|
|
|
267
265
|
# We need to query all nodes
|
|
268
266
|
results, queried_shards = await nidx_query(kbid, Method.SEARCH, pb_query)
|
|
@@ -290,5 +288,4 @@ async def search(
|
|
|
290
288
|
)
|
|
291
289
|
|
|
292
290
|
search_results.shards = queried_shards
|
|
293
|
-
search_results.autofilters = autofilters
|
|
294
291
|
return search_results, incomplete_results
|
nucliadb/search/predict.py
CHANGED
|
@@ -447,6 +447,10 @@ class DummyPredictEngine(PredictEngine):
|
|
|
447
447
|
self.cluster_url = "http://localhost:8000"
|
|
448
448
|
self.public_url = "http://localhost:8000"
|
|
449
449
|
self.calls = []
|
|
450
|
+
self.ndjson_reasoning = [
|
|
451
|
+
b'{"chunk": {"type": "reasoning", "text": "dummy "}}\n',
|
|
452
|
+
b'{"chunk": {"type": "reasoning", "text": "reasoning"}}\n',
|
|
453
|
+
]
|
|
450
454
|
self.ndjson_answer = [
|
|
451
455
|
b'{"chunk": {"type": "text", "text": "valid "}}\n',
|
|
452
456
|
b'{"chunk": {"type": "text", "text": "answer "}}\n',
|
|
@@ -482,8 +486,11 @@ class DummyPredictEngine(PredictEngine):
|
|
|
482
486
|
self.calls.append(("chat_query_ndjson", item))
|
|
483
487
|
|
|
484
488
|
async def generate():
|
|
485
|
-
|
|
486
|
-
|
|
489
|
+
if item.reasoning is not False:
|
|
490
|
+
for chunk in self.ndjson_reasoning:
|
|
491
|
+
yield GenerativeChunk.model_validate_json(chunk)
|
|
492
|
+
for chunk in self.ndjson_answer:
|
|
493
|
+
yield GenerativeChunk.model_validate_json(chunk)
|
|
487
494
|
|
|
488
495
|
return (DUMMY_LEARNING_ID, DUMMY_LEARNING_MODEL, generate())
|
|
489
496
|
|
nucliadb/search/search/cache.py
CHANGED
|
@@ -21,8 +21,6 @@ import contextlib
|
|
|
21
21
|
import logging
|
|
22
22
|
from typing import Optional
|
|
23
23
|
|
|
24
|
-
import backoff
|
|
25
|
-
|
|
26
24
|
from nucliadb.common.cache import (
|
|
27
25
|
extracted_text_cache,
|
|
28
26
|
get_extracted_text_cache,
|
|
@@ -54,7 +52,7 @@ async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
|
|
|
54
52
|
|
|
55
53
|
|
|
56
54
|
async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
|
|
57
|
-
async with get_driver().
|
|
55
|
+
async with get_driver().ro_transaction() as txn:
|
|
58
56
|
storage = await get_storage(service_name=SERVICE_NAME)
|
|
59
57
|
kb = KnowledgeBoxORM(txn, storage, kbid)
|
|
60
58
|
return await kb.get(uuid)
|
|
@@ -74,23 +72,6 @@ async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
|
|
|
74
72
|
return extracted_text
|
|
75
73
|
|
|
76
74
|
|
|
77
|
-
@backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
|
|
78
|
-
async def field_get_extracted_text(field: Field) -> Optional[ExtractedText]:
|
|
79
|
-
try:
|
|
80
|
-
return await field.get_extracted_text()
|
|
81
|
-
except Exception:
|
|
82
|
-
logger.warning(
|
|
83
|
-
"Error getting extracted text for field. Retrying",
|
|
84
|
-
exc_info=True,
|
|
85
|
-
extra={
|
|
86
|
-
"kbid": field.kbid,
|
|
87
|
-
"resource_id": field.resource.uuid,
|
|
88
|
-
"field": f"{field.type}/{field.id}",
|
|
89
|
-
},
|
|
90
|
-
)
|
|
91
|
-
raise
|
|
92
|
-
|
|
93
|
-
|
|
94
75
|
async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
|
|
95
76
|
rid = field.rid
|
|
96
77
|
orm_resource = await get_resource(kbid, rid)
|