nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0

nucliadb/writer/api/v1/knowledgebox.py +3 -46

@@ -20,7 +20,7 @@
  import asyncio
  from functools import partial

- from fastapi import HTTPException, Response
+ from fastapi import HTTPException
  from fastapi_versioning import version
  from starlette.requests import Request

@@ -32,7 +32,7 @@ from nucliadb.common.external_index_providers.exceptions import (
  from nucliadb.common.maindb.utils import get_driver
  from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
- from nucliadb.writer import logger, vectorsets
+ from nucliadb.writer import logger
  from nucliadb.writer.api.utils import only_for_onprem
  from nucliadb.writer.api.v1.router import KB_PREFIX, KBS_PREFIX, api
  from nucliadb.writer.utilities import get_processing

@@ -68,6 +68,7 @@ async def create_kb_endpoint(request: Request, item: KnowledgeBoxConfig) -> Know
      except ExternalIndexCreationError as exc:
          raise HTTPException(status_code=502, detail=str(exc))
      except Exception:
+         logger.exception("Could not create KB")
          raise HTTPException(status_code=500, detail="Error creating knowledge box")
      else:
          return KnowledgeBoxObj(uuid=kbid, slug=slug)

@@ -247,47 +248,3 @@ def to_pinecone_serverless_cloud_pb(
          PineconeServerlessCloud.AZURE_EASTUS2: knowledgebox_pb2.PineconeServerlessCloud.AZURE_EASTUS2,
          PineconeServerlessCloud.GCP_US_CENTRAL1: knowledgebox_pb2.PineconeServerlessCloud.GCP_US_CENTRAL1,
      }[serverless]
-
-
- @api.post(
-     f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
-     status_code=200,
-     summary="Add a vectorset to Knowledge Box",
-     tags=["Knowledge Boxes"],
-     # TODO: remove when the feature is mature
-     include_in_schema=False,
- )
- @requires(NucliaDBRoles.MANAGER)
- @version(1)
- async def add_vectorset(request: Request, kbid: str, vectorset_id: str) -> Response:
-     try:
-         await vectorsets.add(kbid, vectorset_id)
-     except learning_proxy.ProxiedLearningConfigError as err:
-         return Response(
-             status_code=err.status_code,
-             content=err.content,
-             media_type=err.content_type,
-         )
-     return Response(status_code=200)
-
-
- @api.delete(
-     f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
-     status_code=200,
-     summary="Delete vectorset from Knowledge Box",
-     tags=["Knowledge Boxes"],
-     # TODO: remove when the feature is mature
-     include_in_schema=False,
- )
- @requires(NucliaDBRoles.MANAGER)
- @version(1)
- async def delete_vectorset(request: Request, kbid: str, vectorset_id: str) -> Response:
-     try:
-         await vectorsets.delete(kbid, vectorset_id)
-     except learning_proxy.ProxiedLearningConfigError as err:
-         return Response(
-             status_code=err.status_code,
-             content=err.content,
-             media_type=err.content_type,
-         )
-     return Response(status_code=200)

nucliadb/writer/api/v1/resource.py +20 -13

@@ -20,7 +20,7 @@
  import asyncio
  import contextlib
  from time import time
- from typing import Optional
+ from typing import Annotated, Optional
  from uuid import uuid4

  from fastapi import HTTPException, Query, Response

@@ -35,7 +35,7 @@ from nucliadb.common.maindb.utils import get_driver
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
  from nucliadb.ingest.processing import ProcessingInfo, PushPayload, Source
  from nucliadb.writer import SERVICE_NAME, logger
- from nucliadb.writer.api.constants import SKIP_STORE_DEFAULT, X_NUCLIADB_USER
+ from nucliadb.writer.api.constants import X_NUCLIADB_USER, X_SKIP_STORE
  from nucliadb.writer.api.v1 import transaction
  from nucliadb.writer.api.v1.router import (
      KB_PREFIX,

@@ -63,8 +63,8 @@ from nucliadb_models.writer import (
      ResourceUpdated,
      UpdateResourcePayload,
  )
- from nucliadb_protos.resources_pb2 import Metadata
- from nucliadb_protos.writer_pb2 import BrokerMessage, IndexResource
+ from nucliadb_protos.resources_pb2 import FieldID, Metadata
+ from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus, FieldStatus, IndexResource
  from nucliadb_telemetry.errors import capture_exception
  from nucliadb_utils.authentication import requires
  from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError

@@ -90,8 +90,8 @@ async def create_resource(
      request: Request,
      item: CreateResourcePayload,
      kbid: str,
-     x_skip_store: bool = SKIP_STORE_DEFAULT,
-     x_nucliadb_user: str = X_NUCLIADB_USER,
+     x_skip_store: Annotated[bool, X_SKIP_STORE] = False,
+     x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
  ):
      kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
      if item.hidden and not (kb_config and kb_config.hidden_resources_enabled):

@@ -180,8 +180,8 @@ async def modify_resource_rslug_prefix(
      kbid: str,
      rslug: str,
      item: UpdateResourcePayload,
-     x_skip_store: bool = SKIP_STORE_DEFAULT,
-     x_nucliadb_user: str = X_NUCLIADB_USER,
+     x_skip_store: Annotated[bool, X_SKIP_STORE] = False,
+     x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
  ):
      rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
      return await modify_resource_endpoint(

@@ -208,8 +208,8 @@ async def modify_resource_rid_prefix(
      kbid: str,
      rid: str,
      item: UpdateResourcePayload,
-     x_skip_store: bool = SKIP_STORE_DEFAULT,
-     x_nucliadb_user: str = X_NUCLIADB_USER,
+     x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
+     x_skip_store: Annotated[bool, X_SKIP_STORE] = False,
  ):
      return await modify_resource_endpoint(
          request,

@@ -371,7 +371,7 @@ async def reprocess_resource_rslug_prefix(
      request: Request,
      kbid: str,
      rslug: str,
-     x_nucliadb_user: str = X_NUCLIADB_USER,
+     x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
  ):
      rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
      return await _reprocess_resource(request, kbid, rid, x_nucliadb_user=x_nucliadb_user)

@@ -390,7 +390,7 @@ async def reprocess_resource_rid_prefix(
      request: Request,
      kbid: str,
      rid: str,
-     x_nucliadb_user: str = X_NUCLIADB_USER,
+     x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
  ):
      return await _reprocess_resource(request, kbid, rid, x_nucliadb_user=x_nucliadb_user)

@@ -422,6 +422,7 @@ async def _reprocess_resource(
      storage = await get_storage(service_name=SERVICE_NAME)
      driver = get_driver()

+     writer = BrokerMessage()
      async with driver.transaction() as txn:
          kb = KnowledgeBox(txn, storage, kbid)

@@ -430,8 +431,14 @@ async def _reprocess_resource(
          raise HTTPException(status_code=404, detail="Resource does not exist")

      await extract_fields(resource=resource, toprocess=toprocess)
+     for field_type, field_id in resource.fields.keys():
+         writer.field_statuses.append(
+             FieldIDStatus(
+                 id=FieldID(field_type=field_type, field=field_id),
+                 status=FieldStatus.Status.PENDING,
+             )
+         )

-     writer = BrokerMessage()
      writer.kbid = kbid
      writer.uuid = rid
      writer.source = BrokerMessage.MessageSource.WRITER
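
The resource endpoints above replace bare header defaults (SKIP_STORE_DEFAULT and X_NUCLIADB_USER used directly as default values) with FastAPI's Annotated header style, importing X_SKIP_STORE and X_NUCLIADB_USER from nucliadb/writer/api/constants.py (also changed in this release, +20 -16, but not shown here). A minimal sketch of what that constants module is assumed to look like; only the constant names come from this diff, the Header(...) descriptions are illustrative:

    # Hypothetical sketch of nucliadb/writer/api/constants.py; constant names are
    # taken from the imports in the diff, descriptions are made up for illustration.
    from fastapi import Header

    X_SKIP_STORE = Header(description="If true, do not store the field contents in blob storage")
    X_NUCLIADB_USER = Header(description="User on whose behalf the request is made")
    X_FILENAME = Header(description="Name of the file being uploaded")
    X_PASSWORD = Header(description="Password to open the file, if it is protected")
    X_LANGUAGE = Header(description="Language of the file")
    X_MD5 = Header(description="MD5 hash of the file being uploaded")
    X_EXTRACT_STRATEGY = Header(description="Id of the extract strategy to use at processing time")

    # With Annotated, the default value moves into the endpoint signature:
    #     x_skip_store: Annotated[bool, X_SKIP_STORE] = False
    # whereas the old style used the constant itself as the parameter default.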

nucliadb/writer/api/v1/services.py +10 -1

@@ -23,6 +23,7 @@ from starlette.requests import Request

  from nucliadb.common import datamanagers
  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
+ from nucliadb.common.models_utils import to_proto
  from nucliadb.models.responses import (
      HTTPConflict,
      HTTPInternalServerError,

@@ -173,7 +174,15 @@ async def delete_entities(request: Request, kbid: str, group: str):
  @requires(NucliaDBRoles.WRITER)
  @version(1)
  async def set_labelset_endpoint(request: Request, kbid: str, labelset: str, item: LabelSet):
+     if item.title is None:
+         item.title = labelset
+
      try:
+         labelsets = await datamanagers.atomic.labelset.get_all(kbid=kbid)
+         labelset_titles = [ls.title.lower() for (k, ls) in labelsets.labelset.items() if k != labelset]
+         if item.title.lower() in labelset_titles:
+             raise HTTPException(status_code=422, detail="Duplicated labelset titles are not allowed")
+
          await set_labelset(kbid, labelset, item)
      except KnowledgeBoxNotFound:
          raise HTTPException(status_code=404, detail="Knowledge Box does not exist")

@@ -240,7 +249,7 @@ async def delete_labelset(kbid: str, labelset_id: str):
  async def set_custom_synonyms(request: Request, kbid: str, item: KnowledgeBoxSynonyms):
      if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
          raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
-     synonyms = item.to_message()
+     synonyms = to_proto.kb_synonyms(item)
      await datamanagers.atomic.synonyms.set(kbid=kbid, synonyms=synonyms)
      return Response(status_code=204)
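
The labelset change above adds a case-insensitive title deduplication check (falling back to the labelset id when no title is given) before calling set_labelset. A self-contained sketch of that check, using plain strings instead of the LabelSet protobuf objects:

    # Standalone rendering of the duplicate-title rule added to set_labelset_endpoint.
    # `existing` maps labelset id -> title, mirroring labelsets.labelset.items() in the diff.

    def title_is_duplicated(existing: dict[str, str], labelset_id: str, title: str) -> bool:
        """True if another labelset (different key) already uses this title, case-insensitively."""
        other_titles = [t.lower() for k, t in existing.items() if k != labelset_id]
        return title.lower() in other_titles


    existing = {"sentiment": "Sentiment", "topics": "Topics"}
    assert title_is_duplicated(existing, "mood", "SENTIMENT")           # would be rejected with 422
    assert not title_is_duplicated(existing, "sentiment", "Sentiment")  # updating the same labelset is allowed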

nucliadb/writer/api/v1/upload.py +61 -34

@@ -23,10 +23,9 @@ import uuid
  from datetime import datetime
  from hashlib import md5
  from io import BytesIO
- from typing import Optional
+ from typing import Annotated, Optional

  from fastapi import HTTPException
- from fastapi.params import Header
  from fastapi.requests import Request
  from fastapi.responses import Response
  from fastapi_versioning import version

@@ -37,6 +36,7 @@ from nucliadb.ingest.orm.utils import set_title
  from nucliadb.ingest.processing import PushPayload, Source
  from nucliadb.models.responses import HTTPClientError
  from nucliadb.writer import SERVICE_NAME
+ from nucliadb.writer.api.constants import X_EXTRACT_STRATEGY, X_FILENAME, X_LANGUAGE, X_MD5, X_PASSWORD
  from nucliadb.writer.api.v1 import transaction
  from nucliadb.writer.api.v1.resource import (
      get_rid_from_slug_or_raise_error,

@@ -64,8 +64,8 @@ from nucliadb_models import content_types
  from nucliadb_models.resource import NucliaDBRoles
  from nucliadb_models.utils import FieldIdString
  from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
- from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, Metadata
- from nucliadb_protos.writer_pb2 import BrokerMessage
+ from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FieldID, FieldType, Metadata
+ from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus, FieldStatus
  from nucliadb_utils.authentication import requires_one
  from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
  from nucliadb_utils.storages.storage import KB_RESOURCE_FIELD

@@ -74,7 +74,7 @@ from nucliadb_utils.utilities import (
      get_storage,
  )

- from .router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREFIX, api
+ from .router import KB_PREFIX, RESOURCE_PREFIX, RESOURCES_PREFIX, RSLUG_PREFIX, api

  TUS_HEADERS = {
      "Tus-Resumable": "1.0.0",

@@ -142,9 +142,12 @@ async def tus_post_rslug_prefix(
      rslug: str,
      field: FieldIdString,
      item: Optional[CreateResourcePayload] = None,
+     x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
  ) -> Response:
      rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
-     return await _tus_post(request, kbid, item, path_rid=rid, field_id=field)
+     return await _tus_post(
+         request, kbid, item, path_rid=rid, field_id=field, extract_strategy=x_extract_strategy
+     )


  @api.post(

@@ -161,8 +164,11 @@ async def tus_post_rid_prefix(
      path_rid: str,
      field: FieldIdString,
      item: Optional[CreateResourcePayload] = None,
+     x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
  ) -> Response:
-     return await _tus_post(request, kbid, item, path_rid=path_rid, field_id=field)
+     return await _tus_post(
+         request, kbid, item, path_rid=path_rid, field_id=field, extract_strategy=x_extract_strategy
+     )


  @api.post(

@@ -177,8 +183,9 @@ async def tus_post(
      request: Request,
      kbid: str,
      item: Optional[CreateResourcePayload] = None,
+     x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
  ) -> Response:
-     return await _tus_post(request, kbid, item)
+     return await _tus_post(request, kbid, item, extract_strategy=x_extract_strategy)


  # called by one the three POST above - there are defined distinctly to produce clean API doc

@@ -188,6 +195,7 @@ async def _tus_post(
      item: Optional[CreateResourcePayload] = None,
      path_rid: Optional[str] = None,
      field_id: Optional[str] = None,
+     extract_strategy: Optional[str] = None,
  ) -> Response:
      """
      An empty POST request is used to create a new upload resource.

@@ -219,7 +227,7 @@ async def _tus_post(
          size = int(request.headers["upload-length"])
      else:
          if not deferred_length:
-             raise HTTPPreconditionFailed(detail="We need upload-length header")
+             raise HTTPPreconditionFailed(detail="upload-length header is required")

      if "tus-resumable" not in request.headers:
          raise HTTPPreconditionFailed(detail="TUS needs a TUS version")

@@ -285,6 +293,7 @@ async def _tus_post(
          deferred_length=deferred_length,
          offset=0,
          item=creation_payload,
+         extract_strategy=extract_strategy,
      )

      if size is not None:

@@ -502,7 +511,7 @@ async def _tus_patch(

      if offset != dm.offset:
          raise HTTPConflict(
-             detail=f"Current upload offset({offset}) does not match " f"object offset {dm.offset}"
+             detail=f"Current upload offset({offset}) does not match object offset {dm.offset}"
          )

      storage_manager = get_storage_manager()

@@ -535,8 +544,8 @@ async def _tus_patch(
          raise AttributeError()
      path = await storage_manager.finish(dm)
      headers["Tus-Upload-Finished"] = "1"
-     headers["NDB-Resource"] = f"/{KB_PREFIX}/{kbid}/resources/{rid}"
-     headers["NDB-Field"] = f"/{KB_PREFIX}/{kbid}/resources/{rid}/field/{field}"
+     headers["NDB-Resource"] = f"/{KB_PREFIX}/{kbid}/{RESOURCES_PREFIX}/{rid}"
+     headers["NDB-Field"] = f"/{KB_PREFIX}/{kbid}/{RESOURCES_PREFIX}/{rid}/field/{field}"

      item_payload = dm.get("item")
      creation_payload = None

@@ -569,6 +578,7 @@ async def _tus_patch(
          request=request,
          bucket=storage_manager.storage.get_bucket_name(kbid),
          item=creation_payload,
+         extract_strategy=dm.get("extract_strategy") or None,
      )
  except LimitsExceededError as exc:
      raise HTTPException(status_code=exc.status_code, detail=exc.detail)

@@ -602,10 +612,11 @@ async def upload_rslug_prefix(
      kbid: str,
      rslug: str,
      field: FieldIdString,
-     x_filename: Optional[list[str]] = Header(None),  # type: ignore
-     x_password: Optional[list[str]] = Header(None),  # type: ignore
-     x_language: Optional[list[str]] = Header(None),  # type: ignore
-     x_md5: Optional[list[str]] = Header(None),  # type: ignore
+     x_filename: Annotated[Optional[str], X_FILENAME] = None,
+     x_password: Annotated[Optional[str], X_PASSWORD] = None,
+     x_language: Annotated[Optional[str], X_LANGUAGE] = None,
+     x_md5: Annotated[Optional[str], X_MD5] = None,
+     x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
  ) -> ResourceFileUploaded:
      rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
      return await _upload(

@@ -617,6 +628,7 @@ async def upload_rslug_prefix(
          x_password=x_password,
          x_language=x_language,
          x_md5=x_md5,
+         x_extract_strategy=x_extract_strategy,
      )


@@ -634,10 +646,11 @@ async def upload_rid_prefix(
      kbid: str,
      path_rid: str,
      field: FieldIdString,
-     x_filename: Optional[list[str]] = Header(None),  # type: ignore
-     x_password: Optional[list[str]] = Header(None),  # type: ignore
-     x_language: Optional[list[str]] = Header(None),  # type: ignore
-     x_md5: Optional[list[str]] = Header(None),  # type: ignore
+     x_filename: Annotated[Optional[str], X_FILENAME] = None,
+     x_password: Annotated[Optional[str], X_PASSWORD] = None,
+     x_language: Annotated[Optional[str], X_LANGUAGE] = None,
+     x_md5: Annotated[Optional[str], X_MD5] = None,
+     x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
  ) -> ResourceFileUploaded:
      return await _upload(
          request,

@@ -648,6 +661,7 @@ async def upload_rid_prefix(
          x_password=x_password,
          x_language=x_language,
          x_md5=x_md5,
+         x_extract_strategy=x_extract_strategy,
      )


@@ -663,10 +677,11 @@ async def upload_rid_prefix(
  async def upload(
      request: StarletteRequest,
      kbid: str,
-     x_filename: Optional[list[str]] = Header(None),  # type: ignore
-     x_password: Optional[list[str]] = Header(None),  # type: ignore
-     x_language: Optional[list[str]] = Header(None),  # type: ignore
-     x_md5: Optional[list[str]] = Header(None),  # type: ignore
+     x_filename: Annotated[Optional[str], X_FILENAME] = None,
+     x_password: Annotated[Optional[str], X_PASSWORD] = None,
+     x_language: Annotated[Optional[str], X_LANGUAGE] = None,
+     x_md5: Annotated[Optional[str], X_MD5] = None,
+     x_extract_strategy: Annotated[Optional[str], X_EXTRACT_STRATEGY] = None,
  ) -> ResourceFileUploaded:
      return await _upload(
          request,

@@ -675,6 +690,7 @@ async def upload(
          x_password=x_password,
          x_language=x_language,
          x_md5=x_md5,
+         x_extract_strategy=x_extract_strategy,
      )


@@ -684,17 +700,18 @@ async def _upload(
      kbid: str,
      path_rid: Optional[str] = None,
      field: Optional[str] = None,
-     x_filename: Optional[list[str]] = Header(None),  # type: ignore
-     x_password: Optional[list[str]] = Header(None),  # type: ignore
-     x_language: Optional[list[str]] = Header(None),  # type: ignore
-     x_md5: Optional[list[str]] = Header(None),  # type: ignore
+     x_filename: Optional[str] = None,
+     x_password: Optional[str] = None,
+     x_language: Optional[str] = None,
+     x_md5: Optional[str] = None,
+     x_extract_strategy: Optional[str] = None,
  ) -> ResourceFileUploaded:
      if path_rid is not None:
          await validate_rid_exists_or_raise_error(kbid, path_rid)

      await maybe_back_pressure(request, kbid, resource_uuid=path_rid)

-     md5_user = x_md5[0] if x_md5 is not None and len(x_md5) > 0 else None
+     md5_user = x_md5
      path, rid, valid_field = await validate_field_upload(kbid, path_rid, field, md5_user)
      dm = get_dm()
      storage_manager = get_storage_manager()

@@ -715,8 +732,8 @@ async def _upload(

      await dm.start(request)

-     if x_filename and len(x_filename):
-         filename = maybe_b64decode(x_filename[0])
+     if x_filename is not None:
+         filename = maybe_b64decode(x_filename)
      else:
          filename = uuid.uuid4().hex

@@ -772,15 +789,16 @@ async def _upload(
              content_type=content_type,
              override_resource_title=implies_resource_creation,
              filename=filename,
-             password=x_password[0] if x_password and len(x_password) else None,
-             language=x_language[0] if x_language and len(x_language) else None,
-             md5=x_md5[0] if x_md5 and len(x_md5) else None,
+             password=x_password,
+             language=x_language,
+             md5=x_md5,
              field=valid_field,
              source=storage_manager.storage.source,
              rid=rid,
              path=path,
              request=request,
              bucket=storage_manager.storage.get_bucket_name(kbid),
+             extract_strategy=x_extract_strategy,
          )
      except LimitsExceededError as exc:
          raise HTTPException(status_code=exc.status_code, detail=exc.detail)

@@ -840,6 +858,7 @@ async def store_file_on_nuclia_db(
      language: Optional[str] = None,
      md5: Optional[str] = None,
      item: Optional[CreateResourcePayload] = None,
+     extract_strategy: Optional[str] = None,
  ) -> Optional[int]:
      # File is on NucliaDB Storage at path
      partitioning = get_partitioning()

@@ -921,10 +940,18 @@ async def store_file_on_nuclia_db(
          file_field.language = language
      if password:
          file_field.password = password
+     if extract_strategy is not None:
+         file_field.extract_strategy = extract_strategy

      writer.files[field].CopyFrom(file_field)
      # Do not store passwords on maindb
      writer.files[field].ClearField("password")
+     writer.field_statuses.append(
+         FieldIDStatus(
+             id=FieldID(field_type=FieldType.FILE, field=field),
+             status=FieldStatus.Status.PENDING,
+         )
+     )

      toprocess.filefield[field] = await processing.convert_internal_filefield_to_str(
          file_field, storage=storage
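
The upload endpoints above turn x-filename, x-password, x-language and x-md5 into single-valued headers and add an optional x-extract-strategy header, which ends up in FieldFile.extract_strategy and marks the field PENDING. A hedged client sketch follows; the /api/v1/kb/{kbid}/upload path, the hyphenated header spellings, and the host are assumptions inferred from the parameter names, and auth headers are omitted:

    # Hypothetical client call against the upload endpoint (URL and header names assumed).
    import base64

    import requests

    kbid = "my-kb-uuid"
    with open("report.pdf", "rb") as f:
        resp = requests.post(
            f"http://localhost:8080/api/v1/kb/{kbid}/upload",
            headers={
                # maybe_b64decode() in the diff accepts a base64-encoded filename
                "x-filename": base64.b64encode(b"report.pdf").decode(),
                "x-language": "en",
                # New in 6.2.1: forwarded to FieldFile.extract_strategy
                "x-extract-strategy": "my-strategy-id",
            },
            data=f,
        )
    print(resp.status_code, resp.json())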

nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47

@@ -17,37 +17,57 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
- # Copyright (C) 2021 Bosutech XXI S.L.
- #
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
- # For commercial licensing, contact us at info@nuclia.com.
- #
- # AGPL:
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Affero General Public License as
- # published by the Free Software Foundation, either version 3 of the
- # License, or (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Affero General Public License for more details.
- #
- # You should have received a copy of the GNU Affero General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- #
+
+ from fastapi import HTTPException, Response
+ from fastapi_versioning import version
+ from starlette.requests import Request

  from nucliadb import learning_proxy
  from nucliadb.common import datamanagers
  from nucliadb.ingest.orm.exceptions import VectorSetConflict
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
  from nucliadb.writer import logger
+ from nucliadb.writer.api.v1.router import KB_PREFIX, api
+ from nucliadb_models.resource import (
+     NucliaDBRoles,
+ )
+ from nucliadb_models.vectorsets import CreatedVectorSet
  from nucliadb_protos import knowledgebox_pb2
  from nucliadb_telemetry import errors
+ from nucliadb_utils.authentication import requires_one
  from nucliadb_utils.utilities import get_storage


- async def add(kbid: str, vectorset_id: str) -> None:
+ @api.post(
+     f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
+     status_code=201,
+     summary="Add a vector set to Knowledge Box",
+     tags=["VectorSets"],
+     # TODO: remove when the feature is mature
+     include_in_schema=False,
+ )
+ @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
+ @version(1)
+ async def add_vectorset(request: Request, kbid: str, vectorset_id: str) -> CreatedVectorSet:
+     try:
+         await _add_vectorset(kbid, vectorset_id)
+
+     except learning_proxy.ProxiedLearningConfigError as err:
+         raise HTTPException(
+             status_code=err.status_code,
+             detail=err.content,
+         )
+
+     except VectorSetConflict:
+         raise HTTPException(
+             status_code=409,
+             detail="A vectorset with this embedding model already exists in your KB",
+         )
+
+     return CreatedVectorSet(id=vectorset_id)
+
+
+ async def _add_vectorset(kbid: str, vectorset_id: str) -> None:
      # First off, add the vectorset to the learning configuration if it's not already there
      lconfig = await learning_proxy.get_configuration(kbid)
      assert lconfig is not None

@@ -59,34 +79,12 @@ async def add(kbid: str, vectorset_id: str) -> None:
      assert lconfig is not None

      # Then, add the vectorset to the index if it's not already there
+     storage = await get_storage()
+     vectorset_config = get_vectorset_config(lconfig, vectorset_id)
      async with datamanagers.with_rw_transaction() as txn:
-         kbobj = KnowledgeBox(txn, await get_storage(), kbid)
-         vectorset_config = get_vectorset_config(lconfig, vectorset_id)
-         try:
-             await kbobj.create_vectorset(vectorset_config)
-             await txn.commit()
-         except VectorSetConflict:
-             # Vectorset already exists, nothing to do
-             return
-
-
- async def delete(kbid: str, vectorset_id: str) -> None:
-     lconfig = await learning_proxy.get_configuration(kbid)
-     if lconfig is not None:
-         semantic_models = lconfig.model_dump()["semantic_models"]
-         if vectorset_id in semantic_models:
-             semantic_models.remove(vectorset_id)
-             await learning_proxy.update_configuration(kbid, {"semantic_models": semantic_models})
-     try:
-         async with datamanagers.with_rw_transaction() as txn:
-             kbobj = KnowledgeBox(txn, await get_storage(), kbid)
-             await kbobj.delete_vectorset(vectorset_id=vectorset_id)
-             await txn.commit()
-     except Exception as ex:
-         errors.capture_exception(ex)
-         logger.exception(
-             "Could not delete vectorset from index", extra={"kbid": kbid, "vectorset_id": vectorset_id}
-         )
+         kbobj = KnowledgeBox(txn, storage, kbid)
+         await kbobj.create_vectorset(vectorset_config)
+         await txn.commit()


  def get_vectorset_config(

@@ -123,3 +121,57 @@ def get_vectorset_config(
      vectorset_index_config.normalize_vectors = False
      vectorset_config.vectorset_index_config.CopyFrom(vectorset_index_config)
      return vectorset_config
+
+
+ @api.delete(
+     f"/{KB_PREFIX}/{{kbid}}/vectorsets/{{vectorset_id}}",
+     status_code=204,
+     summary="Delete vector set from Knowledge Box",
+     tags=["VectorSets"],
+     # TODO: remove when the feature is mature
+     include_in_schema=False,
+ )
+ @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.WRITER])
+ @version(1)
+ async def delete_vectorset(request: Request, kbid: str, vectorset_id: str) -> Response:
+     try:
+         await _delete_vectorset(kbid, vectorset_id)
+
+     except VectorSetConflict as exc:
+         raise HTTPException(
+             status_code=409,
+             detail=str(exc),
+         )
+
+     except learning_proxy.ProxiedLearningConfigError as err:
+         raise HTTPException(
+             status_code=err.status_code,
+             detail=err.content,
+         )
+
+     return Response(status_code=204)
+
+
+ async def _delete_vectorset(kbid: str, vectorset_id: str) -> None:
+     lconfig = await learning_proxy.get_configuration(kbid)
+     if lconfig is not None:
+         semantic_models = lconfig.model_dump()["semantic_models"]
+         if vectorset_id in semantic_models:
+             semantic_models.remove(vectorset_id)
+             await learning_proxy.update_configuration(kbid, {"semantic_models": semantic_models})
+
+     storage = await get_storage()
+     try:
+         async with datamanagers.with_rw_transaction() as txn:
+             kbobj = KnowledgeBox(txn, storage, kbid)
+             await kbobj.delete_vectorset(vectorset_id=vectorset_id)
+             await txn.commit()
+
+     except VectorSetConflict:
+         # caller should handle this error
+         raise
+     except Exception as ex:
+         errors.capture_exception(ex)
+         logger.exception(
+             "Could not delete vectorset from index", extra={"kbid": kbid, "vectorset_id": vectorset_id}
+         )
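
With the move from nucliadb/writer/vectorsets.py to nucliadb/writer/api/v1/vectorsets.py, adding and deleting vector sets become API endpoints in their own right: POST returns 201 with the created id, DELETE returns 204, both require the MANAGER or WRITER role, and conflicts surface as 409 instead of being silently swallowed. A hedged usage sketch; the /api/v1 prefix, host and vectorset id are assumptions, and auth headers are omitted:

    # Hypothetical calls against the new vectorset endpoints (routes taken from the
    # decorators above; everything else is illustrative).
    import requests

    BASE = "http://localhost:8080/api/v1"
    kbid = "my-kb-uuid"
    vectorset_id = "my-semantic-model"

    # Create: 201 and {"id": vectorset_id} on success, 409 if a vectorset with the
    # same embedding model already exists.
    r = requests.post(f"{BASE}/kb/{kbid}/vectorsets/{vectorset_id}")
    print(r.status_code, r.json())

    # Delete: 204 on success, 409 on VectorSetConflict.
    r = requests.delete(f"{BASE}/kb/{kbid}/vectorsets/{vectorset_id}")
    print(r.status_code)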