nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/reader/api/models.py
CHANGED
@@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Any, Optional, Union
|
|
22
22
|
from pydantic import BaseModel
|
23
23
|
|
24
24
|
import nucliadb_models as models
|
25
|
-
from nucliadb_models.common import
|
25
|
+
from nucliadb_models.common import FieldTypeName
|
26
26
|
from nucliadb_models.resource import (
|
27
27
|
ConversationFieldExtractedData,
|
28
28
|
Error,
|
@@ -52,10 +52,10 @@ class ResourceField(BaseModel):
|
|
52
52
|
value: ValueType = None
|
53
53
|
extracted: Optional[ExtractedDataType] = None
|
54
54
|
error: Optional[Error] = None
|
55
|
+
status: Optional[str] = None
|
56
|
+
errors: Optional[list[Error]] = None
|
55
57
|
|
56
58
|
|
57
|
-
FIELD_NAMES_TO_PB_TYPE_MAP = {v: k for k, v in FIELD_TYPES_MAP.items()}
|
58
|
-
|
59
59
|
FIELD_NAME_TO_EXTRACTED_DATA_FIELD_MAP: dict[FieldTypeName, Any] = {
|
60
60
|
FieldTypeName.TEXT: TextFieldExtractedData,
|
61
61
|
FieldTypeName.FILE: FileFieldExtractedData,
|
@@ -29,9 +29,9 @@ from starlette.datastructures import Headers
|
|
29
29
|
from starlette.responses import StreamingResponse
|
30
30
|
|
31
31
|
from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR
|
32
|
+
from nucliadb.common.models_utils import to_proto
|
32
33
|
from nucliadb.ingest.serialize import get_resource_uuid_by_slug
|
33
34
|
from nucliadb.reader import SERVICE_NAME, logger
|
34
|
-
from nucliadb.reader.api.models import FIELD_NAMES_TO_PB_TYPE_MAP
|
35
35
|
from nucliadb_models.common import FieldTypeName
|
36
36
|
from nucliadb_models.resource import NucliaDBRoles
|
37
37
|
from nucliadb_utils.authentication import requires_one
|
@@ -97,7 +97,7 @@ async def _download_extract_file(
|
|
97
97
|
|
98
98
|
storage = await get_storage(service_name=SERVICE_NAME)
|
99
99
|
|
100
|
-
pb_field_type =
|
100
|
+
pb_field_type = to_proto.field_type_name(field_type)
|
101
101
|
field_type_letter = FIELD_TYPE_PB_TO_STR[pb_field_type]
|
102
102
|
|
103
103
|
sf = storage.file_extracted(kbid, rid, field_type_letter, field_id, download_field)
|
@@ -23,9 +23,9 @@ from starlette.requests import Request
|
|
23
23
|
|
24
24
|
from nucliadb.common import datamanagers
|
25
25
|
from nucliadb.common.maindb.utils import get_driver
|
26
|
+
from nucliadb.common.models_utils import from_proto
|
26
27
|
from nucliadb.reader.api.v1.router import KB_PREFIX, KBS_PREFIX, api
|
27
28
|
from nucliadb_models.resource import (
|
28
|
-
KnowledgeBoxConfig,
|
29
29
|
KnowledgeBoxList,
|
30
30
|
KnowledgeBoxObj,
|
31
31
|
KnowledgeBoxObjSummary,
|
@@ -72,7 +72,7 @@ async def get_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
|
|
72
72
|
return KnowledgeBoxObj(
|
73
73
|
uuid=kbid,
|
74
74
|
slug=kb_config.slug,
|
75
|
-
config=
|
75
|
+
config=from_proto.knowledgebox_config(kb_config),
|
76
76
|
)
|
77
77
|
|
78
78
|
|
@@ -99,5 +99,5 @@ async def get_kb_by_slug(request: Request, slug: str) -> KnowledgeBoxObj:
|
|
99
99
|
return KnowledgeBoxObj(
|
100
100
|
uuid=kbid,
|
101
101
|
slug=kb_config.slug,
|
102
|
-
config=
|
102
|
+
config=from_proto.knowledgebox_config(kb_config),
|
103
103
|
)
|
@@ -22,9 +22,9 @@ from typing import Optional, Union
|
|
22
22
|
from fastapi import Header, HTTPException, Query, Request, Response
|
23
23
|
from fastapi_versioning import version
|
24
24
|
|
25
|
-
import nucliadb_models as models
|
26
25
|
from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG_BASE
|
27
26
|
from nucliadb.common.maindb.utils import get_driver
|
27
|
+
from nucliadb.common.models_utils import from_proto, to_proto
|
28
28
|
from nucliadb.ingest.fields.conversation import Conversation
|
29
29
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as ORMKnowledgeBox
|
30
30
|
from nucliadb.ingest.orm.resource import Resource as ORMResource
|
@@ -37,7 +37,6 @@ from nucliadb.reader import SERVICE_NAME
|
|
37
37
|
from nucliadb.reader.api import DEFAULT_RESOURCE_LIST_PAGE_SIZE
|
38
38
|
from nucliadb.reader.api.models import (
|
39
39
|
FIELD_NAME_TO_EXTRACTED_DATA_FIELD_MAP,
|
40
|
-
FIELD_NAMES_TO_PB_TYPE_MAP,
|
41
40
|
ResourceField,
|
42
41
|
)
|
43
42
|
from nucliadb.reader.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREFIX, api
|
@@ -53,6 +52,7 @@ from nucliadb_models.resource import (
|
|
53
52
|
)
|
54
53
|
from nucliadb_models.search import ResourceProperties
|
55
54
|
from nucliadb_protos import resources_pb2
|
55
|
+
from nucliadb_protos.writer_pb2 import FieldStatus
|
56
56
|
from nucliadb_telemetry import errors
|
57
57
|
from nucliadb_utils.authentication import requires, requires_one
|
58
58
|
from nucliadb_utils.utilities import get_audit, get_storage
|
@@ -334,9 +334,7 @@ async def _get_resource_field(
|
|
334
334
|
) -> Response:
|
335
335
|
storage = await get_storage(service_name=SERVICE_NAME)
|
336
336
|
driver = get_driver()
|
337
|
-
|
338
|
-
pb_field_id = FIELD_NAMES_TO_PB_TYPE_MAP[field_type]
|
339
|
-
|
337
|
+
pb_field_id = to_proto.field_type_name(field_type)
|
340
338
|
async with driver.transaction() as txn:
|
341
339
|
kb = ORMKnowledgeBox(txn, storage, kbid)
|
342
340
|
|
@@ -358,15 +356,15 @@ async def _get_resource_field(
|
|
358
356
|
|
359
357
|
if isinstance(value, resources_pb2.FieldText):
|
360
358
|
value = await field.get_value()
|
361
|
-
resource_field.value =
|
359
|
+
resource_field.value = from_proto.field_text(value)
|
362
360
|
|
363
361
|
if isinstance(value, resources_pb2.FieldFile):
|
364
362
|
value = await field.get_value()
|
365
|
-
resource_field.value =
|
363
|
+
resource_field.value = from_proto.field_file(value)
|
366
364
|
|
367
365
|
if isinstance(value, resources_pb2.FieldLink):
|
368
366
|
value = await field.get_value()
|
369
|
-
resource_field.value =
|
367
|
+
resource_field.value = from_proto.field_link(value)
|
370
368
|
|
371
369
|
if isinstance(field, Conversation):
|
372
370
|
if page == "first":
|
@@ -379,7 +377,7 @@ async def _get_resource_field(
|
|
379
377
|
|
380
378
|
value = await field.get_value(page=page_to_fetch)
|
381
379
|
if value is not None:
|
382
|
-
resource_field.value =
|
380
|
+
resource_field.value = from_proto.conversation(value)
|
383
381
|
|
384
382
|
if ResourceFieldProperties.EXTRACTED in show and extracted:
|
385
383
|
resource_field.extracted = FIELD_NAME_TO_EXTRACTED_DATA_FIELD_MAP[field_type]()
|
@@ -391,9 +389,22 @@ async def _get_resource_field(
|
|
391
389
|
)
|
392
390
|
|
393
391
|
if ResourceFieldProperties.ERROR in show:
|
394
|
-
|
395
|
-
if
|
396
|
-
|
392
|
+
status = await field.get_status()
|
393
|
+
if status is None:
|
394
|
+
status = FieldStatus()
|
395
|
+
resource_field.status = status.Status.Name(status.status)
|
396
|
+
if status.errors:
|
397
|
+
resource_field.errors = []
|
398
|
+
for error in status.errors:
|
399
|
+
resource_field.errors.append(
|
400
|
+
Error(
|
401
|
+
body=error.source_error.error,
|
402
|
+
code=error.source_error.code,
|
403
|
+
code_str=error.source_error.ErrorCode.Name(error.source_error.code),
|
404
|
+
created=error.created.ToDatetime(),
|
405
|
+
)
|
406
|
+
)
|
407
|
+
resource_field.error = resource_field.errors[-1]
|
397
408
|
|
398
409
|
return Response(
|
399
410
|
content=resource_field.model_dump_json(exclude_unset=True, by_alias=True),
|
@@ -32,6 +32,7 @@ from nucliadb.common.context.fastapi import get_app_context
|
|
32
32
|
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
33
33
|
from nucliadb.common.http_clients import processing
|
34
34
|
from nucliadb.common.maindb.utils import get_driver
|
35
|
+
from nucliadb.common.models_utils import from_proto
|
35
36
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
36
37
|
from nucliadb.models.responses import HTTPClientError
|
37
38
|
from nucliadb.reader import SERVICE_NAME
|
@@ -39,7 +40,6 @@ from nucliadb.reader.api.v1.router import KB_PREFIX, api
|
|
39
40
|
from nucliadb.reader.reader.notifications import kb_notifications_stream
|
40
41
|
from nucliadb_models.entities import (
|
41
42
|
EntitiesGroup,
|
42
|
-
EntitiesGroupSummary,
|
43
43
|
KnowledgeBoxEntities,
|
44
44
|
)
|
45
45
|
from nucliadb_models.labels import KnowledgeBoxLabels, LabelSet
|
@@ -86,7 +86,7 @@ async def list_entities_groups(kbid: str):
|
|
86
86
|
if entities_groups.status == ListEntitiesGroupsResponse.Status.OK:
|
87
87
|
response = KnowledgeBoxEntities(uuid=kbid)
|
88
88
|
for key, eg_summary in entities_groups.groups.items():
|
89
|
-
entities_group =
|
89
|
+
entities_group = from_proto.entities_group_summary(eg_summary)
|
90
90
|
response.groups[key] = entities_group
|
91
91
|
return response
|
92
92
|
elif entities_groups.status == ListEntitiesGroupsResponse.Status.NOTFOUND:
|
@@ -114,7 +114,7 @@ async def get_entity(request: Request, kbid: str, group: str) -> EntitiesGroup:
|
|
114
114
|
|
115
115
|
kbobj: GetEntitiesGroupResponse = await ingest.GetEntitiesGroup(l_request) # type: ignore
|
116
116
|
if kbobj.status == GetEntitiesGroupResponse.Status.OK:
|
117
|
-
response =
|
117
|
+
response = from_proto.entities_group(kbobj.group)
|
118
118
|
return response
|
119
119
|
elif kbobj.status == GetEntitiesGroupResponse.Status.KB_NOT_FOUND:
|
120
120
|
raise HTTPException(status_code=404, detail=f"Knowledge Box '{kbid}' does not exist")
|
@@ -208,7 +208,7 @@ async def get_custom_synonyms(request: Request, kbid: str):
|
|
208
208
|
if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
|
209
209
|
raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
|
210
210
|
synonyms = await datamanagers.atomic.synonyms.get(kbid=kbid) or Synonyms()
|
211
|
-
return
|
211
|
+
return from_proto.kb_synonyms(synonyms)
|
212
212
|
|
213
213
|
|
214
214
|
@api.get(
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from fastapi_versioning import version
|
21
|
+
from starlette.requests import Request
|
22
|
+
|
23
|
+
from nucliadb.common import datamanagers
|
24
|
+
from nucliadb.reader.api.v1.router import KB_PREFIX, api
|
25
|
+
from nucliadb_models.resource import (
|
26
|
+
NucliaDBRoles,
|
27
|
+
)
|
28
|
+
from nucliadb_models.vectorsets import VectorSetList, VectorSetListItem
|
29
|
+
from nucliadb_utils.authentication import requires_one
|
30
|
+
|
31
|
+
|
32
|
+
@api.get(
|
33
|
+
f"/{KB_PREFIX}/{{kbid}}/vectorsets",
|
34
|
+
status_code=200,
|
35
|
+
summary="List vector sets",
|
36
|
+
response_model=VectorSetList,
|
37
|
+
tags=["Vector Sets"],
|
38
|
+
# TODO: remove when the feature is mature
|
39
|
+
include_in_schema=False,
|
40
|
+
)
|
41
|
+
@requires_one([NucliaDBRoles.READER])
|
42
|
+
@version(1)
|
43
|
+
async def list_vectorsets(request: Request, kbid: str) -> VectorSetList:
|
44
|
+
vectorsets = []
|
45
|
+
async with datamanagers.with_ro_transaction() as txn:
|
46
|
+
async for vid, _ in datamanagers.vectorsets.iter(txn, kbid=kbid):
|
47
|
+
vectorsets.append(VectorSetListItem(id=vid))
|
48
|
+
return VectorSetList(vectorsets=vectorsets)
|
nucliadb/search/api/v1/ask.py
CHANGED
@@ -36,7 +36,8 @@ from nucliadb_models.search import (
|
|
36
36
|
SyncAskResponse,
|
37
37
|
parse_max_tokens,
|
38
38
|
)
|
39
|
-
from
|
39
|
+
from nucliadb_models.security import RequestSecurity
|
40
|
+
from nucliadb_utils.authentication import NucliaUser, requires
|
40
41
|
|
41
42
|
|
42
43
|
@api.post(
|
@@ -62,6 +63,15 @@ async def ask_knowledgebox_endpoint(
|
|
62
63
|
"This is slower and requires waiting for entire answer to be ready.",
|
63
64
|
),
|
64
65
|
) -> Union[StreamingResponse, HTTPClientError, Response]:
|
66
|
+
current_user: NucliaUser = request.user
|
67
|
+
# If present, security groups from AuthorizationBackend overrides any
|
68
|
+
# security group of the payload
|
69
|
+
if current_user.security_groups:
|
70
|
+
if item.security is None:
|
71
|
+
item.security = RequestSecurity(groups=current_user.security_groups)
|
72
|
+
else:
|
73
|
+
item.security.groups = current_user.security_groups
|
74
|
+
|
65
75
|
return await create_ask_response(
|
66
76
|
kbid, item, x_nucliadb_user, x_ndb_client, x_forwarded_for, x_synchronous
|
67
77
|
)
|
@@ -18,10 +18,10 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
-
|
22
21
|
from fastapi import Header, Request, Response
|
23
22
|
from fastapi_versioning import version
|
24
23
|
|
24
|
+
from nucliadb.common.models_utils import to_proto
|
25
25
|
from nucliadb.models.responses import HTTPClientError
|
26
26
|
from nucliadb.search import logger
|
27
27
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
@@ -56,11 +56,11 @@ async def send_feedback_endpoint(
|
|
56
56
|
audit.feedback(
|
57
57
|
kbid=kbid,
|
58
58
|
user=x_nucliadb_user,
|
59
|
-
client_type=
|
59
|
+
client_type=to_proto.client_type(x_ndb_client),
|
60
60
|
origin=x_forwarded_for,
|
61
61
|
learning_id=item.ident,
|
62
62
|
good=item.good,
|
63
|
-
task=item.task
|
63
|
+
task=to_proto.feedback_task(item.task),
|
64
64
|
feedback=item.feedback,
|
65
65
|
text_block_id=item.text_block_id,
|
66
66
|
)
|
@@ -32,6 +32,7 @@ from nucliadb.common.cluster.utils import get_shard_manager
|
|
32
32
|
from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
|
33
33
|
from nucliadb.common.counters import IndexCounts
|
34
34
|
from nucliadb.common.external_index_providers.manager import get_external_index_manager
|
35
|
+
from nucliadb.common.models_utils import from_proto
|
35
36
|
from nucliadb.search import logger
|
36
37
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
37
38
|
from nucliadb.search.api.v1.utils import fastapi_query
|
@@ -47,9 +48,7 @@ from nucliadb_protos.noderesources_pb2 import Shard
|
|
47
48
|
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
48
49
|
from nucliadb_protos.writer_pb2 import Shards
|
49
50
|
from nucliadb_telemetry import errors
|
50
|
-
from nucliadb_utils import const
|
51
51
|
from nucliadb_utils.authentication import requires, requires_one
|
52
|
-
from nucliadb_utils.utilities import has_feature
|
53
52
|
|
54
53
|
MAX_PARAGRAPHS_FOR_SMALL_KB = 250_000
|
55
54
|
|
@@ -73,7 +72,7 @@ async def knowledgebox_shards(request: Request, kbid: str) -> KnowledgeboxShards
|
|
73
72
|
status_code=404,
|
74
73
|
detail="The knowledgebox or its shards configuration is missing",
|
75
74
|
)
|
76
|
-
return
|
75
|
+
return from_proto.kb_shards(shards)
|
77
76
|
|
78
77
|
|
79
78
|
@api.get(
|
@@ -124,8 +123,9 @@ async def _kb_counters(
|
|
124
123
|
counters.sentences = index_counts.sentences
|
125
124
|
is_small_kb = index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
|
126
125
|
resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
|
127
|
-
# TODO: Find a way to query the fields count from the external index provider or use the catalog
|
126
|
+
# TODO: Find a way to query the fields count and size from the external index provider or use the catalog
|
128
127
|
counters.resources = counters.fields = resource_count
|
128
|
+
counters.index_size = counters.paragraphs * AVG_PARAGRAPH_SIZE_BYTES
|
129
129
|
else:
|
130
130
|
node_index_counts, queried_shards = await get_node_index_counts(kbid)
|
131
131
|
counters.fields = node_index_counts.fields
|
@@ -134,7 +134,7 @@ async def _kb_counters(
|
|
134
134
|
is_small_kb = node_index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
|
135
135
|
resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
|
136
136
|
counters.resources = resource_count
|
137
|
-
|
137
|
+
counters.index_size = node_index_counts.size_bytes
|
138
138
|
if debug and queried_shards is not None:
|
139
139
|
counters.shards = queried_shards
|
140
140
|
return counters
|
@@ -165,9 +165,7 @@ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
|
|
165
165
|
queried_shards = []
|
166
166
|
for shard_object in shard_groups:
|
167
167
|
try:
|
168
|
-
node, shard_id = choose_node(
|
169
|
-
shard_object, use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid})
|
170
|
-
)
|
168
|
+
node, shard_id = choose_node(shard_object)
|
171
169
|
except KeyError:
|
172
170
|
raise HTTPException(
|
173
171
|
status_code=500,
|
@@ -205,11 +203,7 @@ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
|
|
205
203
|
if results is None:
|
206
204
|
raise HTTPException(status_code=503, detail=f"No shards found")
|
207
205
|
|
208
|
-
counts = IndexCounts(
|
209
|
-
fields=0,
|
210
|
-
paragraphs=0,
|
211
|
-
sentences=0,
|
212
|
-
)
|
206
|
+
counts = IndexCounts(fields=0, paragraphs=0, sentences=0, size_bytes=0)
|
213
207
|
for shard in results:
|
214
208
|
if isinstance(shard, Exception):
|
215
209
|
logger.error("Error getting shard info", exc_info=shard)
|
@@ -218,4 +212,5 @@ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
|
|
218
212
|
counts.fields += shard.fields
|
219
213
|
counts.paragraphs += shard.paragraphs
|
220
214
|
counts.sentences += shard.sentences
|
215
|
+
counts.size_bytes += shard.size_bytes
|
221
216
|
return counts, queried_shards
|
nucliadb/search/api/v1/search.py
CHANGED
@@ -27,6 +27,7 @@ from fastapi_versioning import version
|
|
27
27
|
from pydantic import ValidationError
|
28
28
|
|
29
29
|
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
30
|
+
from nucliadb.common.models_utils import to_proto
|
30
31
|
from nucliadb.models.responses import HTTPClientError
|
31
32
|
from nucliadb.search import predict
|
32
33
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
@@ -292,7 +293,7 @@ async def search(
|
|
292
293
|
hidden=await filter_hidden_resources(kbid, item.show_hidden),
|
293
294
|
rephrase_prompt=item.rephrase_prompt,
|
294
295
|
)
|
295
|
-
pb_query, incomplete_results, autofilters = await query_parser.parse()
|
296
|
+
pb_query, incomplete_results, autofilters, _ = await query_parser.parse()
|
296
297
|
|
297
298
|
results, query_incomplete_results, queried_nodes = await node_query(
|
298
299
|
kbid, Method.SEARCH, pb_query, target_shard_replicas=item.shards
|
@@ -318,7 +319,7 @@ async def search(
|
|
318
319
|
audit.search(
|
319
320
|
kbid,
|
320
321
|
x_nucliadb_user,
|
321
|
-
|
322
|
+
to_proto.client_type(x_ndb_client),
|
322
323
|
x_forwarded_for,
|
323
324
|
pb_query,
|
324
325
|
time() - start_time,
|
nucliadb/search/predict.py
CHANGED
@@ -21,7 +21,7 @@ import json
|
|
21
21
|
import os
|
22
22
|
import random
|
23
23
|
from enum import Enum
|
24
|
-
from typing import Any,
|
24
|
+
from typing import Any, AsyncGenerator, Optional
|
25
25
|
from unittest.mock import AsyncMock, Mock
|
26
26
|
|
27
27
|
import aiohttp
|
@@ -121,12 +121,14 @@ class AnswerStatusCode(str, Enum):
|
|
121
121
|
SUCCESS = "0"
|
122
122
|
ERROR = "-1"
|
123
123
|
NO_CONTEXT = "-2"
|
124
|
+
NO_RETRIEVAL_DATA = "-3"
|
124
125
|
|
125
126
|
def prettify(self) -> str:
|
126
127
|
return {
|
127
128
|
AnswerStatusCode.SUCCESS: "success",
|
128
129
|
AnswerStatusCode.ERROR: "error",
|
129
130
|
AnswerStatusCode.NO_CONTEXT: "no_context",
|
131
|
+
AnswerStatusCode.NO_RETRIEVAL_DATA: "no_retrieval_data",
|
130
132
|
}[self]
|
131
133
|
|
132
134
|
|
@@ -266,7 +268,7 @@ class PredictEngine:
|
|
266
268
|
@predict_observer.wrap({"type": "chat_ndjson"})
|
267
269
|
async def chat_query_ndjson(
|
268
270
|
self, kbid: str, item: ChatModel
|
269
|
-
) -> tuple[str, str,
|
271
|
+
) -> tuple[str, str, AsyncGenerator[GenerativeChunk, None]]:
|
270
272
|
"""
|
271
273
|
Chat query using the new stream format
|
272
274
|
Format specs: https://github.com/ndjson/ndjson-spec
|
@@ -442,7 +444,7 @@ class DummyPredictEngine(PredictEngine):
|
|
442
444
|
|
443
445
|
async def chat_query_ndjson(
|
444
446
|
self, kbid: str, item: ChatModel
|
445
|
-
) -> tuple[str, str,
|
447
|
+
) -> tuple[str, str, AsyncGenerator[GenerativeChunk, None]]:
|
446
448
|
self.calls.append(("chat_query_ndjson", item))
|
447
449
|
|
448
450
|
async def generate():
|
@@ -553,7 +555,7 @@ def get_answer_generator(response: aiohttp.ClientResponse):
|
|
553
555
|
|
554
556
|
def get_chat_ndjson_generator(
|
555
557
|
response: aiohttp.ClientResponse,
|
556
|
-
) ->
|
558
|
+
) -> AsyncGenerator[GenerativeChunk, None]:
|
557
559
|
async def _parse_generative_chunks(gen):
|
558
560
|
async for chunk in gen:
|
559
561
|
try:
|
@@ -123,7 +123,6 @@ async def node_query(
|
|
123
123
|
try:
|
124
124
|
node, shard_id = cluster_manager.choose_node(
|
125
125
|
shard_obj,
|
126
|
-
use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid}),
|
127
126
|
use_read_replica_nodes=use_read_replica_nodes,
|
128
127
|
target_shard_replicas=target_shard_replicas,
|
129
128
|
)
|
@@ -224,7 +223,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
|
|
224
223
|
)
|
225
224
|
else:
|
226
225
|
errors.capture_exception(result)
|
227
|
-
logger.exception("Error while querying shard data", exc_info=result)
|
226
|
+
logger.exception(f"Error while querying shard data {result}", exc_info=result)
|
228
227
|
|
229
228
|
return HTTPException(status_code=status_code, detail=reason)
|
230
229
|
|