nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -20,7 +20,7 @@
|
|
20
20
|
import asyncio
|
21
21
|
from functools import partial
|
22
22
|
|
23
|
-
from fastapi import HTTPException
|
23
|
+
from fastapi import HTTPException
|
24
24
|
from fastapi_versioning import version
|
25
25
|
from starlette.requests import Request
|
26
26
|
|
@@ -32,7 +32,7 @@ from nucliadb.common.external_index_providers.exceptions import (
|
|
32
32
|
from nucliadb.common.maindb.utils import get_driver
|
33
33
|
from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
|
34
34
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
35
|
-
from nucliadb.writer import logger
|
35
|
+
from nucliadb.writer import logger
|
36
36
|
from nucliadb.writer.api.utils import only_for_onprem
|
37
37
|
from nucliadb.writer.api.v1.router import KB_PREFIX, KBS_PREFIX, api
|
38
38
|
from nucliadb.writer.utilities import get_processing
|
@@ -68,6 +68,7 @@ async def create_kb_endpoint(request: Request, item: KnowledgeBoxConfig) -> Know
|
|
68
68
|
except ExternalIndexCreationError as exc:
|
69
69
|
raise HTTPException(status_code=502, detail=str(exc))
|
70
70
|
except Exception:
|
71
|
+
logger.exception("Could not create KB")
|
71
72
|
raise HTTPException(status_code=500, detail="Error creating knowledge box")
|
72
73
|
else:
|
73
74
|
return KnowledgeBoxObj(uuid=kbid, slug=slug)
|
@@ -247,47 +248,3 @@ def to_pinecone_serverless_cloud_pb(
|
|
247
248
|
PineconeServerlessCloud.AZURE_EASTUS2: knowledgebox_pb2.PineconeServerlessCloud.AZURE_EASTUS2,
|
248
249
|
PineconeServerlessCloud.GCP_US_CENTRAL1: knowledgebox_pb2.PineconeServerlessCloud.GCP_US_CENTRAL1,
|
249
250
|
}[serverless]
|
250
|
-
|
251
|
-
|
252
|
-
@api.post(
|
253
|
-
f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
|
254
|
-
status_code=200,
|
255
|
-
summary="Add a vectorset to Knowledge Box",
|
256
|
-
tags=["Knowledge Boxes"],
|
257
|
-
# TODO: remove when the feature is mature
|
258
|
-
include_in_schema=False,
|
259
|
-
)
|
260
|
-
@requires(NucliaDBRoles.MANAGER)
|
261
|
-
@version(1)
|
262
|
-
async def add_vectorset(request: Request, kbid: str, vectorset_id: str) -> Response:
|
263
|
-
try:
|
264
|
-
await vectorsets.add(kbid, vectorset_id)
|
265
|
-
except learning_proxy.ProxiedLearningConfigError as err:
|
266
|
-
return Response(
|
267
|
-
status_code=err.status_code,
|
268
|
-
content=err.content,
|
269
|
-
media_type=err.content_type,
|
270
|
-
)
|
271
|
-
return Response(status_code=200)
|
272
|
-
|
273
|
-
|
274
|
-
@api.delete(
|
275
|
-
f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
|
276
|
-
status_code=200,
|
277
|
-
summary="Delete vectorset from Knowledge Box",
|
278
|
-
tags=["Knowledge Boxes"],
|
279
|
-
# TODO: remove when the feature is mature
|
280
|
-
include_in_schema=False,
|
281
|
-
)
|
282
|
-
@requires(NucliaDBRoles.MANAGER)
|
283
|
-
@version(1)
|
284
|
-
async def delete_vectorset(request: Request, kbid: str, vectorset_id: str) -> Response:
|
285
|
-
try:
|
286
|
-
await vectorsets.delete(kbid, vectorset_id)
|
287
|
-
except learning_proxy.ProxiedLearningConfigError as err:
|
288
|
-
return Response(
|
289
|
-
status_code=err.status_code,
|
290
|
-
content=err.content,
|
291
|
-
media_type=err.content_type,
|
292
|
-
)
|
293
|
-
return Response(status_code=200)
|
@@ -20,7 +20,7 @@
|
|
20
20
|
import asyncio
|
21
21
|
import contextlib
|
22
22
|
from time import time
|
23
|
-
from typing import Optional
|
23
|
+
from typing import Annotated, Optional
|
24
24
|
from uuid import uuid4
|
25
25
|
|
26
26
|
from fastapi import HTTPException, Query, Response
|
@@ -35,7 +35,7 @@ from nucliadb.common.maindb.utils import get_driver
|
|
35
35
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
36
36
|
from nucliadb.ingest.processing import ProcessingInfo, PushPayload, Source
|
37
37
|
from nucliadb.writer import SERVICE_NAME, logger
|
38
|
-
from nucliadb.writer.api.constants import
|
38
|
+
from nucliadb.writer.api.constants import X_NUCLIADB_USER, X_SKIP_STORE
|
39
39
|
from nucliadb.writer.api.v1 import transaction
|
40
40
|
from nucliadb.writer.api.v1.router import (
|
41
41
|
KB_PREFIX,
|
@@ -63,8 +63,8 @@ from nucliadb_models.writer import (
|
|
63
63
|
ResourceUpdated,
|
64
64
|
UpdateResourcePayload,
|
65
65
|
)
|
66
|
-
from nucliadb_protos.resources_pb2 import Metadata
|
67
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage, IndexResource
|
66
|
+
from nucliadb_protos.resources_pb2 import FieldID, Metadata
|
67
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus, FieldStatus, IndexResource
|
68
68
|
from nucliadb_telemetry.errors import capture_exception
|
69
69
|
from nucliadb_utils.authentication import requires
|
70
70
|
from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
|
@@ -90,8 +90,8 @@ async def create_resource(
|
|
90
90
|
request: Request,
|
91
91
|
item: CreateResourcePayload,
|
92
92
|
kbid: str,
|
93
|
-
x_skip_store: bool =
|
94
|
-
x_nucliadb_user: str =
|
93
|
+
x_skip_store: Annotated[bool, X_SKIP_STORE] = False,
|
94
|
+
x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
|
95
95
|
):
|
96
96
|
kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
|
97
97
|
if item.hidden and not (kb_config and kb_config.hidden_resources_enabled):
|
@@ -180,8 +180,8 @@ async def modify_resource_rslug_prefix(
|
|
180
180
|
kbid: str,
|
181
181
|
rslug: str,
|
182
182
|
item: UpdateResourcePayload,
|
183
|
-
x_skip_store: bool =
|
184
|
-
x_nucliadb_user: str =
|
183
|
+
x_skip_store: Annotated[bool, X_SKIP_STORE] = False,
|
184
|
+
x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
|
185
185
|
):
|
186
186
|
rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
|
187
187
|
return await modify_resource_endpoint(
|
@@ -208,8 +208,8 @@ async def modify_resource_rid_prefix(
|
|
208
208
|
kbid: str,
|
209
209
|
rid: str,
|
210
210
|
item: UpdateResourcePayload,
|
211
|
-
|
212
|
-
|
211
|
+
x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
|
212
|
+
x_skip_store: Annotated[bool, X_SKIP_STORE] = False,
|
213
213
|
):
|
214
214
|
return await modify_resource_endpoint(
|
215
215
|
request,
|
@@ -371,7 +371,7 @@ async def reprocess_resource_rslug_prefix(
|
|
371
371
|
request: Request,
|
372
372
|
kbid: str,
|
373
373
|
rslug: str,
|
374
|
-
x_nucliadb_user: str =
|
374
|
+
x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
|
375
375
|
):
|
376
376
|
rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
|
377
377
|
return await _reprocess_resource(request, kbid, rid, x_nucliadb_user=x_nucliadb_user)
|
@@ -390,7 +390,7 @@ async def reprocess_resource_rid_prefix(
|
|
390
390
|
request: Request,
|
391
391
|
kbid: str,
|
392
392
|
rid: str,
|
393
|
-
x_nucliadb_user: str =
|
393
|
+
x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
|
394
394
|
):
|
395
395
|
return await _reprocess_resource(request, kbid, rid, x_nucliadb_user=x_nucliadb_user)
|
396
396
|
|
@@ -422,6 +422,7 @@ async def _reprocess_resource(
|
|
422
422
|
storage = await get_storage(service_name=SERVICE_NAME)
|
423
423
|
driver = get_driver()
|
424
424
|
|
425
|
+
writer = BrokerMessage()
|
425
426
|
async with driver.transaction() as txn:
|
426
427
|
kb = KnowledgeBox(txn, storage, kbid)
|
427
428
|
|
@@ -430,8 +431,14 @@ async def _reprocess_resource(
|
|
430
431
|
raise HTTPException(status_code=404, detail="Resource does not exist")
|
431
432
|
|
432
433
|
await extract_fields(resource=resource, toprocess=toprocess)
|
434
|
+
for field_type, field_id in resource.fields.keys():
|
435
|
+
writer.field_statuses.append(
|
436
|
+
FieldIDStatus(
|
437
|
+
id=FieldID(field_type=field_type, field=field_id),
|
438
|
+
status=FieldStatus.Status.PENDING,
|
439
|
+
)
|
440
|
+
)
|
433
441
|
|
434
|
-
writer = BrokerMessage()
|
435
442
|
writer.kbid = kbid
|
436
443
|
writer.uuid = rid
|
437
444
|
writer.source = BrokerMessage.MessageSource.WRITER
|
@@ -23,6 +23,7 @@ from starlette.requests import Request
|
|
23
23
|
|
24
24
|
from nucliadb.common import datamanagers
|
25
25
|
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
26
|
+
from nucliadb.common.models_utils import to_proto
|
26
27
|
from nucliadb.models.responses import (
|
27
28
|
HTTPConflict,
|
28
29
|
HTTPInternalServerError,
|
@@ -173,7 +174,15 @@ async def delete_entities(request: Request, kbid: str, group: str):
|
|
173
174
|
@requires(NucliaDBRoles.WRITER)
|
174
175
|
@version(1)
|
175
176
|
async def set_labelset_endpoint(request: Request, kbid: str, labelset: str, item: LabelSet):
|
177
|
+
if item.title is None:
|
178
|
+
item.title = labelset
|
179
|
+
|
176
180
|
try:
|
181
|
+
labelsets = await datamanagers.atomic.labelset.get_all(kbid=kbid)
|
182
|
+
labelset_titles = [ls.title.lower() for (k, ls) in labelsets.labelset.items() if k != labelset]
|
183
|
+
if item.title.lower() in labelset_titles:
|
184
|
+
raise HTTPException(status_code=422, detail="Duplicated labelset titles are not allowed")
|
185
|
+
|
177
186
|
await set_labelset(kbid, labelset, item)
|
178
187
|
except KnowledgeBoxNotFound:
|
179
188
|
raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
|
@@ -240,7 +249,7 @@ async def delete_labelset(kbid: str, labelset_id: str):
|
|
240
249
|
async def set_custom_synonyms(request: Request, kbid: str, item: KnowledgeBoxSynonyms):
|
241
250
|
if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
|
242
251
|
raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
|
243
|
-
synonyms =
|
252
|
+
synonyms = to_proto.kb_synonyms(item)
|
244
253
|
await datamanagers.atomic.synonyms.set(kbid=kbid, synonyms=synonyms)
|
245
254
|
return Response(status_code=204)
|
246
255
|
|
nucliadb/writer/api/v1/upload.py
CHANGED
@@ -23,10 +23,9 @@ import uuid
|
|
23
23
|
from datetime import datetime
|
24
24
|
from hashlib import md5
|
25
25
|
from io import BytesIO
|
26
|
-
from typing import Optional
|
26
|
+
from typing import Annotated, Optional
|
27
27
|
|
28
28
|
from fastapi import HTTPException
|
29
|
-
from fastapi.params import Header
|
30
29
|
from fastapi.requests import Request
|
31
30
|
from fastapi.responses import Response
|
32
31
|
from fastapi_versioning import version
|
@@ -37,6 +36,7 @@ from nucliadb.ingest.orm.utils import set_title
|
|
37
36
|
from nucliadb.ingest.processing import PushPayload, Source
|
38
37
|
from nucliadb.models.responses import HTTPClientError
|
39
38
|
from nucliadb.writer import SERVICE_NAME
|
39
|
+
from nucliadb.writer.api.constants import X_EXTRACT_STRATEGY, X_FILENAME, X_LANGUAGE, X_MD5, X_PASSWORD
|
40
40
|
from nucliadb.writer.api.v1 import transaction
|
41
41
|
from nucliadb.writer.api.v1.resource import (
|
42
42
|
get_rid_from_slug_or_raise_error,
|
@@ -64,8 +64,8 @@ from nucliadb_models import content_types
|
|
64
64
|
from nucliadb_models.resource import NucliaDBRoles
|
65
65
|
from nucliadb_models.utils import FieldIdString
|
66
66
|
from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
|
67
|
-
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, Metadata
|
68
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage
|
67
|
+
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FieldID, FieldType, Metadata
|
68
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus, FieldStatus
|
69
69
|
from nucliadb_utils.authentication import requires_one
|
70
70
|
from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
|
71
71
|
from nucliadb_utils.storages.storage import KB_RESOURCE_FIELD
|
@@ -74,7 +74,7 @@ from nucliadb_utils.utilities import (
|
|
74
74
|
get_storage,
|
75
75
|
)
|
76
76
|
|
77
|
-
from .router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREFIX, api
|
77
|
+
from .router import KB_PREFIX, RESOURCE_PREFIX, RESOURCES_PREFIX, RSLUG_PREFIX, api
|
78
78
|
|
79
79
|
TUS_HEADERS = {
|
80
80
|
"Tus-Resumable": "1.0.0",
|
@@ -142,9 +142,12 @@ async def tus_post_rslug_prefix(
|
|
142
142
|
rslug: str,
|
143
143
|
field: FieldIdString,
|
144
144
|
item: Optional[CreateResourcePayload] = None,
|
145
|
+
x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
|
145
146
|
) -> Response:
|
146
147
|
rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
|
147
|
-
return await _tus_post(
|
148
|
+
return await _tus_post(
|
149
|
+
request, kbid, item, path_rid=rid, field_id=field, extract_strategy=x_extract_strategy
|
150
|
+
)
|
148
151
|
|
149
152
|
|
150
153
|
@api.post(
|
@@ -161,8 +164,11 @@ async def tus_post_rid_prefix(
|
|
161
164
|
path_rid: str,
|
162
165
|
field: FieldIdString,
|
163
166
|
item: Optional[CreateResourcePayload] = None,
|
167
|
+
x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
|
164
168
|
) -> Response:
|
165
|
-
return await _tus_post(
|
169
|
+
return await _tus_post(
|
170
|
+
request, kbid, item, path_rid=path_rid, field_id=field, extract_strategy=x_extract_strategy
|
171
|
+
)
|
166
172
|
|
167
173
|
|
168
174
|
@api.post(
|
@@ -177,8 +183,9 @@ async def tus_post(
|
|
177
183
|
request: Request,
|
178
184
|
kbid: str,
|
179
185
|
item: Optional[CreateResourcePayload] = None,
|
186
|
+
x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
|
180
187
|
) -> Response:
|
181
|
-
return await _tus_post(request, kbid, item)
|
188
|
+
return await _tus_post(request, kbid, item, extract_strategy=x_extract_strategy)
|
182
189
|
|
183
190
|
|
184
191
|
# called by one of the three POST endpoints above - they are defined distinctly to produce clean API docs
|
@@ -188,6 +195,7 @@ async def _tus_post(
|
|
188
195
|
item: Optional[CreateResourcePayload] = None,
|
189
196
|
path_rid: Optional[str] = None,
|
190
197
|
field_id: Optional[str] = None,
|
198
|
+
extract_strategy: Optional[str] = None,
|
191
199
|
) -> Response:
|
192
200
|
"""
|
193
201
|
An empty POST request is used to create a new upload resource.
|
@@ -219,7 +227,7 @@ async def _tus_post(
|
|
219
227
|
size = int(request.headers["upload-length"])
|
220
228
|
else:
|
221
229
|
if not deferred_length:
|
222
|
-
raise HTTPPreconditionFailed(detail="
|
230
|
+
raise HTTPPreconditionFailed(detail="upload-length header is required")
|
223
231
|
|
224
232
|
if "tus-resumable" not in request.headers:
|
225
233
|
raise HTTPPreconditionFailed(detail="TUS needs a TUS version")
|
@@ -285,6 +293,7 @@ async def _tus_post(
|
|
285
293
|
deferred_length=deferred_length,
|
286
294
|
offset=0,
|
287
295
|
item=creation_payload,
|
296
|
+
extract_strategy=extract_strategy,
|
288
297
|
)
|
289
298
|
|
290
299
|
if size is not None:
|
@@ -502,7 +511,7 @@ async def _tus_patch(
|
|
502
511
|
|
503
512
|
if offset != dm.offset:
|
504
513
|
raise HTTPConflict(
|
505
|
-
detail=f"Current upload offset({offset}) does not match
|
514
|
+
detail=f"Current upload offset({offset}) does not match object offset {dm.offset}"
|
506
515
|
)
|
507
516
|
|
508
517
|
storage_manager = get_storage_manager()
|
@@ -535,8 +544,8 @@ async def _tus_patch(
|
|
535
544
|
raise AttributeError()
|
536
545
|
path = await storage_manager.finish(dm)
|
537
546
|
headers["Tus-Upload-Finished"] = "1"
|
538
|
-
headers["NDB-Resource"] = f"/{KB_PREFIX}/{kbid}/
|
539
|
-
headers["NDB-Field"] = f"/{KB_PREFIX}/{kbid}/
|
547
|
+
headers["NDB-Resource"] = f"/{KB_PREFIX}/{kbid}/{RESOURCES_PREFIX}/{rid}"
|
548
|
+
headers["NDB-Field"] = f"/{KB_PREFIX}/{kbid}/{RESOURCES_PREFIX}/{rid}/field/{field}"
|
540
549
|
|
541
550
|
item_payload = dm.get("item")
|
542
551
|
creation_payload = None
|
@@ -569,6 +578,7 @@ async def _tus_patch(
|
|
569
578
|
request=request,
|
570
579
|
bucket=storage_manager.storage.get_bucket_name(kbid),
|
571
580
|
item=creation_payload,
|
581
|
+
extract_strategy=dm.get("extract_strategy") or None,
|
572
582
|
)
|
573
583
|
except LimitsExceededError as exc:
|
574
584
|
raise HTTPException(status_code=exc.status_code, detail=exc.detail)
|
@@ -602,10 +612,11 @@ async def upload_rslug_prefix(
|
|
602
612
|
kbid: str,
|
603
613
|
rslug: str,
|
604
614
|
field: FieldIdString,
|
605
|
-
x_filename: Optional[
|
606
|
-
x_password: Optional[
|
607
|
-
x_language: Optional[
|
608
|
-
x_md5: Optional[
|
615
|
+
x_filename: Annotated[Optional[str], X_FILENAME] = None,
|
616
|
+
x_password: Annotated[Optional[str], X_PASSWORD] = None,
|
617
|
+
x_language: Annotated[Optional[str], X_LANGUAGE] = None,
|
618
|
+
x_md5: Annotated[Optional[str], X_MD5] = None,
|
619
|
+
x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
|
609
620
|
) -> ResourceFileUploaded:
|
610
621
|
rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
|
611
622
|
return await _upload(
|
@@ -617,6 +628,7 @@ async def upload_rslug_prefix(
|
|
617
628
|
x_password=x_password,
|
618
629
|
x_language=x_language,
|
619
630
|
x_md5=x_md5,
|
631
|
+
x_extract_strategy=x_extract_strategy,
|
620
632
|
)
|
621
633
|
|
622
634
|
|
@@ -634,10 +646,11 @@ async def upload_rid_prefix(
|
|
634
646
|
kbid: str,
|
635
647
|
path_rid: str,
|
636
648
|
field: FieldIdString,
|
637
|
-
x_filename: Optional[
|
638
|
-
x_password: Optional[
|
639
|
-
x_language: Optional[
|
640
|
-
x_md5: Optional[
|
649
|
+
x_filename: Annotated[Optional[str], X_FILENAME] = None,
|
650
|
+
x_password: Annotated[Optional[str], X_PASSWORD] = None,
|
651
|
+
x_language: Annotated[Optional[str], X_LANGUAGE] = None,
|
652
|
+
x_md5: Annotated[Optional[str], X_MD5] = None,
|
653
|
+
x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
|
641
654
|
) -> ResourceFileUploaded:
|
642
655
|
return await _upload(
|
643
656
|
request,
|
@@ -648,6 +661,7 @@ async def upload_rid_prefix(
|
|
648
661
|
x_password=x_password,
|
649
662
|
x_language=x_language,
|
650
663
|
x_md5=x_md5,
|
664
|
+
x_extract_strategy=x_extract_strategy,
|
651
665
|
)
|
652
666
|
|
653
667
|
|
@@ -663,10 +677,11 @@ async def upload_rid_prefix(
|
|
663
677
|
async def upload(
|
664
678
|
request: StarletteRequest,
|
665
679
|
kbid: str,
|
666
|
-
x_filename: Optional[
|
667
|
-
x_password: Optional[
|
668
|
-
x_language: Optional[
|
669
|
-
x_md5: Optional[
|
680
|
+
x_filename: Annotated[Optional[str], X_FILENAME] = None,
|
681
|
+
x_password: Annotated[Optional[str], X_PASSWORD] = None,
|
682
|
+
x_language: Annotated[Optional[str], X_LANGUAGE] = None,
|
683
|
+
x_md5: Annotated[Optional[str], X_MD5] = None,
|
684
|
+
x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
|
670
685
|
) -> ResourceFileUploaded:
|
671
686
|
return await _upload(
|
672
687
|
request,
|
@@ -675,6 +690,7 @@ async def upload(
|
|
675
690
|
x_password=x_password,
|
676
691
|
x_language=x_language,
|
677
692
|
x_md5=x_md5,
|
693
|
+
x_extract_strategy=x_extract_strategy,
|
678
694
|
)
|
679
695
|
|
680
696
|
|
@@ -684,17 +700,18 @@ async def _upload(
|
|
684
700
|
kbid: str,
|
685
701
|
path_rid: Optional[str] = None,
|
686
702
|
field: Optional[str] = None,
|
687
|
-
x_filename: Optional[
|
688
|
-
x_password: Optional[
|
689
|
-
x_language: Optional[
|
690
|
-
x_md5: Optional[
|
703
|
+
x_filename: Optional[str] = None,
|
704
|
+
x_password: Optional[str] = None,
|
705
|
+
x_language: Optional[str] = None,
|
706
|
+
x_md5: Optional[str] = None,
|
707
|
+
x_extract_strategy: Optional[str] = None,
|
691
708
|
) -> ResourceFileUploaded:
|
692
709
|
if path_rid is not None:
|
693
710
|
await validate_rid_exists_or_raise_error(kbid, path_rid)
|
694
711
|
|
695
712
|
await maybe_back_pressure(request, kbid, resource_uuid=path_rid)
|
696
713
|
|
697
|
-
md5_user = x_md5
|
714
|
+
md5_user = x_md5
|
698
715
|
path, rid, valid_field = await validate_field_upload(kbid, path_rid, field, md5_user)
|
699
716
|
dm = get_dm()
|
700
717
|
storage_manager = get_storage_manager()
|
@@ -715,8 +732,8 @@ async def _upload(
|
|
715
732
|
|
716
733
|
await dm.start(request)
|
717
734
|
|
718
|
-
if x_filename
|
719
|
-
filename = maybe_b64decode(x_filename
|
735
|
+
if x_filename is not None:
|
736
|
+
filename = maybe_b64decode(x_filename)
|
720
737
|
else:
|
721
738
|
filename = uuid.uuid4().hex
|
722
739
|
|
@@ -772,15 +789,16 @@ async def _upload(
|
|
772
789
|
content_type=content_type,
|
773
790
|
override_resource_title=implies_resource_creation,
|
774
791
|
filename=filename,
|
775
|
-
password=x_password
|
776
|
-
language=x_language
|
777
|
-
md5=x_md5
|
792
|
+
password=x_password,
|
793
|
+
language=x_language,
|
794
|
+
md5=x_md5,
|
778
795
|
field=valid_field,
|
779
796
|
source=storage_manager.storage.source,
|
780
797
|
rid=rid,
|
781
798
|
path=path,
|
782
799
|
request=request,
|
783
800
|
bucket=storage_manager.storage.get_bucket_name(kbid),
|
801
|
+
extract_strategy=x_extract_strategy,
|
784
802
|
)
|
785
803
|
except LimitsExceededError as exc:
|
786
804
|
raise HTTPException(status_code=exc.status_code, detail=exc.detail)
|
@@ -840,6 +858,7 @@ async def store_file_on_nuclia_db(
|
|
840
858
|
language: Optional[str] = None,
|
841
859
|
md5: Optional[str] = None,
|
842
860
|
item: Optional[CreateResourcePayload] = None,
|
861
|
+
extract_strategy: Optional[str] = None,
|
843
862
|
) -> Optional[int]:
|
844
863
|
# File is on NucliaDB Storage at path
|
845
864
|
partitioning = get_partitioning()
|
@@ -921,10 +940,18 @@ async def store_file_on_nuclia_db(
|
|
921
940
|
file_field.language = language
|
922
941
|
if password:
|
923
942
|
file_field.password = password
|
943
|
+
if extract_strategy is not None:
|
944
|
+
file_field.extract_strategy = extract_strategy
|
924
945
|
|
925
946
|
writer.files[field].CopyFrom(file_field)
|
926
947
|
# Do not store passwords on maindb
|
927
948
|
writer.files[field].ClearField("password")
|
949
|
+
writer.field_statuses.append(
|
950
|
+
FieldIDStatus(
|
951
|
+
id=FieldID(field_type=FieldType.FILE, field=field),
|
952
|
+
status=FieldStatus.Status.PENDING,
|
953
|
+
)
|
954
|
+
)
|
928
955
|
|
929
956
|
toprocess.filefield[field] = await processing.convert_internal_filefield_to_str(
|
930
957
|
file_field, storage=storage
|
@@ -17,37 +17,57 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
#
|
25
|
-
# AGPL:
|
26
|
-
# This program is free software: you can redistribute it and/or modify
|
27
|
-
# it under the terms of the GNU Affero General Public License as
|
28
|
-
# published by the Free Software Foundation, either version 3 of the
|
29
|
-
# License, or (at your option) any later version.
|
30
|
-
#
|
31
|
-
# This program is distributed in the hope that it will be useful,
|
32
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
33
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
34
|
-
# GNU Affero General Public License for more details.
|
35
|
-
#
|
36
|
-
# You should have received a copy of the GNU Affero General Public License
|
37
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
38
|
-
#
|
20
|
+
|
21
|
+
from fastapi import HTTPException, Response
|
22
|
+
from fastapi_versioning import version
|
23
|
+
from starlette.requests import Request
|
39
24
|
|
40
25
|
from nucliadb import learning_proxy
|
41
26
|
from nucliadb.common import datamanagers
|
42
27
|
from nucliadb.ingest.orm.exceptions import VectorSetConflict
|
43
28
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
44
29
|
from nucliadb.writer import logger
|
30
|
+
from nucliadb.writer.api.v1.router import KB_PREFIX, api
|
31
|
+
from nucliadb_models.resource import (
|
32
|
+
NucliaDBRoles,
|
33
|
+
)
|
34
|
+
from nucliadb_models.vectorsets import CreatedVectorSet
|
45
35
|
from nucliadb_protos import knowledgebox_pb2
|
46
36
|
from nucliadb_telemetry import errors
|
37
|
+
from nucliadb_utils.authentication import requires_one
|
47
38
|
from nucliadb_utils.utilities import get_storage
|
48
39
|
|
49
40
|
|
50
|
-
|
41
|
+
@api.post(
|
42
|
+
f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
|
43
|
+
status_code=201,
|
44
|
+
summary="Add a vector set to Knowledge Box",
|
45
|
+
tags=["VectorSets"],
|
46
|
+
# TODO: remove when the feature is mature
|
47
|
+
include_in_schema=False,
|
48
|
+
)
|
49
|
+
@requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
|
50
|
+
@version(1)
|
51
|
+
async def add_vectorset(request: Request, kbid: str, vectorset_id: str) -> CreatedVectorSet:
|
52
|
+
try:
|
53
|
+
await _add_vectorset(kbid, vectorset_id)
|
54
|
+
|
55
|
+
except learning_proxy.ProxiedLearningConfigError as err:
|
56
|
+
raise HTTPException(
|
57
|
+
status_code=err.status_code,
|
58
|
+
detail=err.content,
|
59
|
+
)
|
60
|
+
|
61
|
+
except VectorSetConflict:
|
62
|
+
raise HTTPException(
|
63
|
+
status_code=409,
|
64
|
+
detail="A vectorset with this embedding model already exists in your KB",
|
65
|
+
)
|
66
|
+
|
67
|
+
return CreatedVectorSet(id=vectorset_id)
|
68
|
+
|
69
|
+
|
70
|
+
async def _add_vectorset(kbid: str, vectorset_id: str) -> None:
|
51
71
|
# First off, add the vectorset to the learning configuration if it's not already there
|
52
72
|
lconfig = await learning_proxy.get_configuration(kbid)
|
53
73
|
assert lconfig is not None
|
@@ -59,34 +79,12 @@ async def add(kbid: str, vectorset_id: str) -> None:
|
|
59
79
|
assert lconfig is not None
|
60
80
|
|
61
81
|
# Then, add the vectorset to the index if it's not already there
|
82
|
+
storage = await get_storage()
|
83
|
+
vectorset_config = get_vectorset_config(lconfig, vectorset_id)
|
62
84
|
async with datamanagers.with_rw_transaction() as txn:
|
63
|
-
kbobj = KnowledgeBox(txn,
|
64
|
-
|
65
|
-
|
66
|
-
await kbobj.create_vectorset(vectorset_config)
|
67
|
-
await txn.commit()
|
68
|
-
except VectorSetConflict:
|
69
|
-
# Vectorset already exists, nothing to do
|
70
|
-
return
|
71
|
-
|
72
|
-
|
73
|
-
async def delete(kbid: str, vectorset_id: str) -> None:
|
74
|
-
lconfig = await learning_proxy.get_configuration(kbid)
|
75
|
-
if lconfig is not None:
|
76
|
-
semantic_models = lconfig.model_dump()["semantic_models"]
|
77
|
-
if vectorset_id in semantic_models:
|
78
|
-
semantic_models.remove(vectorset_id)
|
79
|
-
await learning_proxy.update_configuration(kbid, {"semantic_models": semantic_models})
|
80
|
-
try:
|
81
|
-
async with datamanagers.with_rw_transaction() as txn:
|
82
|
-
kbobj = KnowledgeBox(txn, await get_storage(), kbid)
|
83
|
-
await kbobj.delete_vectorset(vectorset_id=vectorset_id)
|
84
|
-
await txn.commit()
|
85
|
-
except Exception as ex:
|
86
|
-
errors.capture_exception(ex)
|
87
|
-
logger.exception(
|
88
|
-
"Could not delete vectorset from index", extra={"kbid": kbid, "vectorset_id": vectorset_id}
|
89
|
-
)
|
85
|
+
kbobj = KnowledgeBox(txn, storage, kbid)
|
86
|
+
await kbobj.create_vectorset(vectorset_config)
|
87
|
+
await txn.commit()
|
90
88
|
|
91
89
|
|
92
90
|
def get_vectorset_config(
|
@@ -123,3 +121,57 @@ def get_vectorset_config(
|
|
123
121
|
vectorset_index_config.normalize_vectors = False
|
124
122
|
vectorset_config.vectorset_index_config.CopyFrom(vectorset_index_config)
|
125
123
|
return vectorset_config
|
124
|
+
|
125
|
+
|
126
|
+
@api.delete(
|
127
|
+
f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
|
128
|
+
status_code=204,
|
129
|
+
summary="Delete vector set from Knowledge Box",
|
130
|
+
tags=["VectorSets"],
|
131
|
+
# TODO: remove when the feature is mature
|
132
|
+
include_in_schema=False,
|
133
|
+
)
|
134
|
+
@requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
|
135
|
+
@version(1)
|
136
|
+
async def delete_vectorset(request: Request, kbid: str, vectorset_id: str) -> Response:
|
137
|
+
try:
|
138
|
+
await _delete_vectorset(kbid, vectorset_id)
|
139
|
+
|
140
|
+
except VectorSetConflict as exc:
|
141
|
+
raise HTTPException(
|
142
|
+
status_code=409,
|
143
|
+
detail=str(exc),
|
144
|
+
)
|
145
|
+
|
146
|
+
except learning_proxy.ProxiedLearningConfigError as err:
|
147
|
+
raise HTTPException(
|
148
|
+
status_code=err.status_code,
|
149
|
+
detail=err.content,
|
150
|
+
)
|
151
|
+
|
152
|
+
return Response(status_code=204)
|
153
|
+
|
154
|
+
|
155
|
+
async def _delete_vectorset(kbid: str, vectorset_id: str) -> None:
|
156
|
+
lconfig = await learning_proxy.get_configuration(kbid)
|
157
|
+
if lconfig is not None:
|
158
|
+
semantic_models = lconfig.model_dump()["semantic_models"]
|
159
|
+
if vectorset_id in semantic_models:
|
160
|
+
semantic_models.remove(vectorset_id)
|
161
|
+
await learning_proxy.update_configuration(kbid, {"semantic_models": semantic_models})
|
162
|
+
|
163
|
+
storage = await get_storage()
|
164
|
+
try:
|
165
|
+
async with datamanagers.with_rw_transaction() as txn:
|
166
|
+
kbobj = KnowledgeBox(txn, storage, kbid)
|
167
|
+
await kbobj.delete_vectorset(vectorset_id=vectorset_id)
|
168
|
+
await txn.commit()
|
169
|
+
|
170
|
+
except VectorSetConflict:
|
171
|
+
# caller should handle this error
|
172
|
+
raise
|
173
|
+
except Exception as ex:
|
174
|
+
errors.capture_exception(ex)
|
175
|
+
logger.exception(
|
176
|
+
"Could not delete vectorset from index", extra={"kbid": kbid, "vectorset_id": vectorset_id}
|
177
|
+
)
|