nucliadb-utils 5.0.1.post1091__py3-none-any.whl → 5.0.1.post1101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb_utils/aiopynecone/client.py +42 -5
- nucliadb_utils/storages/gcs.py +22 -7
- {nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/METADATA +3 -3
- {nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/RECORD +7 -7
- {nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/WHEEL +0 -0
- {nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/top_level.txt +0 -0
- {nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/zip-safe +0 -0
nucliadb_utils/aiopynecone/client.py
CHANGED
@@ -28,7 +28,7 @@ from typing import Any, AsyncGenerator, Optional
 import backoff
 import httpx
 
-from nucliadb_telemetry.metrics import Observer
+from nucliadb_telemetry.metrics import INF, Histogram, Observer
 from nucliadb_utils.aiopynecone.exceptions import (
     PineconeAPIError,
     PineconeRateLimitError,
@@ -46,6 +46,25 @@ from nucliadb_utils.aiopynecone.models import (
 
 logger = logging.getLogger(__name__)
 
+upsert_batch_size_histogram = Histogram(
+    "pinecone_upsert_batch_size",
+    buckets=[10.0, 100.0, 200.0, 500.0, 1000.0, 5000.0, INF],
+)
+upsert_batch_count_histogram = Histogram(
+    "pinecone_upsert_batch_count",
+    buckets=[0.0, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0, INF],
+)
+
+delete_batch_size_histogram = Histogram(
+    "pinecone_delete_batch_size",
+    buckets=[1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 150.0, INF],
+)
+
+delete_batch_count_histogram = Histogram(
+    "pinecone_delete_batch_count",
+    buckets=[0.0, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0, INF],
+)
+
 
 pinecone_observer = Observer(
     "pinecone_client",
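For context, Histogram and INF come from nucliadb_telemetry.metrics and are used Prometheus-style here: fixed buckets, one observe() per event. The sketch below reproduces those bucket semantics with prometheus_client directly; that dependency and the metric help text are assumptions for illustration only, not part of this diff.

# Sketch: a Prometheus histogram with the same buckets as the one added above.
# Assumes prometheus_client semantics; nucliadb_telemetry's wrapper may differ in detail.
from prometheus_client import Histogram, generate_latest

INF = float("inf")

upsert_batch_size = Histogram(
    "pinecone_upsert_batch_size",
    "Vectors per upsert call",  # help text is made up for this sketch
    buckets=[10.0, 100.0, 200.0, 500.0, 1000.0, 5000.0, INF],
)

upsert_batch_size.observe(250)  # counted under le=500 (and all larger cumulative buckets)
upsert_batch_size.observe(7)    # counted under le=10

print(generate_latest().decode())  # dump current metrics in exposition format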
@@ -57,7 +76,6 @@ pinecone_observer = Observer(
 
 DEFAULT_TIMEOUT = 30
 CONTROL_PLANE_BASE_URL = "https://api.pinecone.io/"
-INDEX_HOST_BASE_URL = "https://{index_host}/"
 BASE_API_HEADERS = {
     "Content-Type": "application/json",
     "Accept": "application/json",
@@ -198,6 +216,7 @@ class DataPlane:
         if len(vectors) == 0:
             # Nothing to upsert.
             return
+        upsert_batch_size_histogram.observe(len(vectors))
         headers = {"Api-Key": self.api_key}
         payload = UpsertRequest(vectors=vectors)
         post_kwargs: dict[str, Any] = {
@@ -261,7 +280,10 @@ class DataPlane:
         for batch in batchify(vectors, batch_size):
             tasks.append(asyncio.create_task(_upsert_batch(batch)))
 
-        await asyncio.gather(*tasks)
+        upsert_batch_count_histogram.observe(len(tasks))
+
+        if len(tasks) > 0:
+            await asyncio.gather(*tasks)
 
     @backoff.on_exception(
         backoff.expo,
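The surrounding code splits an upsert into batches, schedules one asyncio task per batch, and the new histogram records how many batches a single call produced. A self-contained sketch of that fan-out pattern follows; chunked() is a hypothetical stand-in for the module's batchify helper, which this diff does not show.

# Illustrative sketch of the batch fan-out pattern; chunked() is invented for this example.
import asyncio
from typing import Awaitable, Callable, Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunked(items: Iterable[T], size: int) -> Iterator[List[T]]:
    batch: List[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch

async def upsert_all(vectors: List[T], batch_size: int,
                     upsert_batch: Callable[[List[T]], Awaitable[None]]) -> int:
    tasks = [asyncio.create_task(upsert_batch(b)) for b in chunked(vectors, batch_size)]
    # Same guard as in the diff: only gather when at least one batch was scheduled.
    if tasks:
        await asyncio.gather(*tasks)
    return len(tasks)  # the value the batch-count histogram observes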
@@ -280,8 +302,16 @@ class DataPlane:
         """
         if len(ids) > MAX_DELETE_BATCH_SIZE:
             raise ValueError(f"Maximum number of ids in a single request is {MAX_DELETE_BATCH_SIZE}.")
+        if len(ids) == 0:  # pragma: no cover
+            return
 
+        delete_batch_size_histogram.observe(len(ids))
         headers = {"Api-Key": self.api_key}
+
+        # This is a temporary log info to hunt down a bug.
+        rids = {vid.split("/")[0] for vid in ids}
+        logger.info(f"Deleting vectors from resources: {list(rids)}")
+
         payload = {"ids": ids}
         post_kwargs: dict[str, Any] = {
             "headers": headers,
@@ -428,7 +458,10 @@ class DataPlane:
         async for batch in async_batchify(async_iterable, batch_size):
            tasks.append(asyncio.create_task(_delete_batch(batch)))
 
-        await asyncio.gather(*tasks)
+        delete_batch_count_histogram.observe(len(tasks))
+
+        if len(tasks) > 0:
+            await asyncio.gather(*tasks)
 
     @backoff.on_exception(
         backoff.expo,
@@ -516,8 +549,12 @@ class PineconeSession:
         if session is not None:
             return session
 
+        base_url = index_host
+        if not index_host.startswith("https://"):
+            base_url = f"https://{index_host}/"
+
         session = httpx.AsyncClient(
-            base_url=INDEX_HOST_BASE_URL.format(index_host=index_host),
+            base_url=base_url,
             headers=BASE_API_HEADERS,
             timeout=DEFAULT_TIMEOUT,
         )
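The net effect of the base_url change is that index hosts may now be passed either as bare hostnames or as full https:// URLs. A minimal sketch of the same normalization (the hostname below is made up):

def normalize_index_host(index_host: str) -> str:
    # Mirrors the logic above: full https URLs pass through untouched,
    # bare hostnames get a scheme and trailing slash added.
    if index_host.startswith("https://"):
        return index_host
    return f"https://{index_host}/"

assert normalize_index_host("my-index.svc.pinecone.io") == "https://my-index.svc.pinecone.io/"
assert normalize_index_host("https://my-index.svc.pinecone.io/") == "https://my-index.svc.pinecone.io/"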
nucliadb_utils/storages/gcs.py
CHANGED
@@ -34,6 +34,7 @@ import aiohttp.client_exceptions
 import backoff
 import google.auth.transport.requests  # type: ignore
 import yarl
+from google.auth.exceptions import DefaultCredentialsError  # type: ignore
 from google.oauth2 import service_account  # type: ignore
 
 from nucliadb_protos.resources_pb2 import CloudFile
@@ -458,12 +459,25 @@ class GCSStorage(Storage):
         url: str = "https://www.googleapis.com",
         scopes: Optional[List[str]] = None,
     ):
-        if account_credentials is not None:
+        if account_credentials is None:
+            self._json_credentials = None
+        elif isinstance(account_credentials, str) and account_credentials.strip() == "":
+            self._json_credentials = None
+        else:
             self._json_credentials = json.loads(base64.b64decode(account_credentials))
+
+        if self._json_credentials is not None:
             self._credentials = service_account.Credentials.from_service_account_info(
                 self._json_credentials,
                 scopes=DEFAULT_SCOPES if scopes is None else scopes,
             )
+        else:
+            try:
+                self._credentials, self._project = google.auth.default()
+            except DefaultCredentialsError:
+                logger.warning("Setting up without credentials as couldn't find workload identity")
+                self._credentials = None
+
 
         self.source = CloudFile.GCS
         self.deadletter_bucket = deadletter_bucket
         self.indexing_bucket = indexing_bucket
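In isolation, the new credential selection amounts to: use the base64-encoded service-account JSON when one is configured, otherwise fall back to Application Default Credentials (workload identity), and tolerate environments with neither. A hedged sketch of that logic; the function name is invented and DEFAULT_SCOPES is an assumed value, not taken from this diff.

# Sketch of the credential fallback above, in isolation.
import base64
import json
import logging

import google.auth
from google.auth.exceptions import DefaultCredentialsError
from google.oauth2 import service_account

logger = logging.getLogger(__name__)
DEFAULT_SCOPES = ["https://www.googleapis.com/auth/devstorage.read_write"]  # assumed value


def load_gcs_credentials(account_credentials=None, scopes=None):
    if account_credentials and account_credentials.strip():
        # Explicit credentials: a base64-encoded service-account JSON blob.
        info = json.loads(base64.b64decode(account_credentials))
        return service_account.Credentials.from_service_account_info(
            info, scopes=scopes or DEFAULT_SCOPES
        )
    try:
        # Application Default Credentials / workload identity, taken from the environment.
        credentials, _project = google.auth.default()
        return credentials
    except DefaultCredentialsError:
        logger.warning("No explicit credentials and no workload identity found")
        return None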
@@ -473,16 +487,15 @@ class GCSStorage(Storage):
         # https://cloud.google.com/storage/docs/bucket-locations
         self._bucket_labels = labels or {}
         self._executor = executor
-        self._creation_access_token = datetime.now()
         self._upload_url = url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable"  # noqa
         self.object_base_url = url + "/storage/v1/b"
         self._client = None
 
     def _get_access_token(self):
-        if self._credentials.valid is False:
-
-            self._credentials.refresh(
-
+        if self._credentials.expired or self._credentials.valid is False:
+            request = google.auth.transport.requests.Request()
+            self._credentials.refresh(request)
+
         return self._credentials.token
 
     @storage_ops_observer.wrap({"type": "initialize"})
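The _get_access_token change refreshes the token when it is expired or no longer valid, using google-auth's transport Request object. A standalone sketch of that standard refresh pattern (the credentials object is whichever google-auth credential the storage was set up with):

# Minimal sketch of the refresh flow shown above.
import google.auth.transport.requests


def get_access_token(credentials) -> str:
    if credentials.expired or not credentials.valid:
        # google-auth refreshes tokens through a transport Request.
        credentials.refresh(google.auth.transport.requests.Request())
    return credentials.token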
@@ -552,7 +565,9 @@ class GCSStorage(Storage):
             raise AttributeError()
 
         headers = await self.get_access_headers()
-        url
+        # Using object access url instead of bucket access to avoid
+        # giving admin permission to the SA, needed to GET a bucket
+        url = f"{self.object_base_url}/{bucket_name}/o"
         async with self.session.get(
             url,
             headers=headers,
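The new URL targets the bucket's objects endpoint rather than the bucket resource itself, so the service account only needs permission to list objects instead of bucket metadata access. A rough sketch of probing a bucket through that endpoint with aiohttp; the helper name and the maxResults parameter are illustrative, not taken from this diff.

# Sketch: check bucket accessibility via the objects list endpoint of the JSON API.
import aiohttp


async def bucket_exists(session: aiohttp.ClientSession, object_base_url: str,
                        bucket_name: str, headers: dict) -> bool:
    # GET .../storage/v1/b/{bucket}/o needs storage.objects.list,
    # unlike GET .../storage/v1/b/{bucket}, which needs bucket metadata access.
    url = f"{object_base_url}/{bucket_name}/o"
    async with session.get(url, headers=headers, params={"maxResults": "1"}) as resp:
        return resp.status == 200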
{nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nucliadb_utils
-Version: 5.0.1.post1091
+Version: 5.0.1.post1101
 Home-page: https://nuclia.com
 License: BSD
 Classifier: Development Status :: 4 - Beta
@@ -24,8 +24,8 @@ Requires-Dist: PyNaCl
 Requires-Dist: pyjwt>=2.4.0
 Requires-Dist: memorylru>=1.1.2
 Requires-Dist: mrflagly>=0.2.9
-Requires-Dist: nucliadb-protos>=5.0.1.post1091
-Requires-Dist: nucliadb-telemetry>=5.0.1.post1091
+Requires-Dist: nucliadb-protos>=5.0.1.post1101
+Requires-Dist: nucliadb-telemetry>=5.0.1.post1101
 Provides-Extra: cache
 Requires-Dist: redis>=4.3.4; extra == "cache"
 Requires-Dist: orjson>=3.6.7; extra == "cache"
{nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/RECORD
RENAMED
@@ -18,7 +18,7 @@ nucliadb_utils/store.py,sha256=kQ35HemE0v4_Qg6xVqNIJi8vSFAYQtwI3rDtMsNy62Y,890
 nucliadb_utils/transaction.py,sha256=mwcI3aIHAvU5KOGqd_Uz_d1XQzXhk_-NWY8NqU1lfb0,7307
 nucliadb_utils/utilities.py,sha256=idajCm_4Sojh7b3HTkP0fTfG2Mb6PIB9xtMmcfB7Nl0,15758
 nucliadb_utils/aiopynecone/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb_utils/aiopynecone/client.py,sha256=
+nucliadb_utils/aiopynecone/client.py,sha256=3DGJuHXO9Vm83atMLD4fd2FTz8SAr8RvPCGU_bXH9Ho,22333
 nucliadb_utils/aiopynecone/exceptions.py,sha256=EEE0XoGs1zIB5yOJ_fy6yoG4uIb4cWIawYdJeNe4eDo,3012
 nucliadb_utils/aiopynecone/models.py,sha256=ketK2IYLWiwFZ76rnJmwfcuopFJrCAtCUszdTSurm_Q,3236
 nucliadb_utils/audit/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -47,7 +47,7 @@ nucliadb_utils/nuclia_usage/utils/kb_usage_report.py,sha256=lTr9CMBpdk34KtkH5K8v
 nucliadb_utils/storages/__init__.py,sha256=5Qc8AUWiJv9_JbGCBpAn88AIJhwDlm0OPQpg2ZdRL4U,872
 nucliadb_utils/storages/azure.py,sha256=egMDwLNIGSQyVevuySt2AswzFdNAcih05BbRg3-p8IU,16015
 nucliadb_utils/storages/exceptions.py,sha256=mm_wX4YRtp7u7enkk_4pMSlX5AQQuFbq4xLmupVDt3Y,2502
-nucliadb_utils/storages/gcs.py,sha256=
+nucliadb_utils/storages/gcs.py,sha256=TQlPpg9HzfHbOu_iCLPUo9KAql1IIjWxJ9mVJ9_CrV8,27658
 nucliadb_utils/storages/local.py,sha256=NxC_nMBd38NDsR266DSgoBLdQlvUwf0_sd50r-BLI0E,10288
 nucliadb_utils/storages/nuclia.py,sha256=vEv94xAT7QM2g80S25QyrOw2pzvP2BAX-ADgZLtuCVc,2097
 nucliadb_utils/storages/object_store.py,sha256=Tw10GmpYfM5TMqJ3Tk9pLQ9wLMBk1-snL_m6uasiZDQ,4257
@@ -64,8 +64,8 @@ nucliadb_utils/tests/indexing.py,sha256=YW2QhkhO9Q_8A4kKWJaWSvXvyQ_AiAwY1VylcfVQ
 nucliadb_utils/tests/local.py,sha256=7nuP8EFUAiA8ZH50R1iPV9EUXBySQxOanVm3Zht_e0g,1835
 nucliadb_utils/tests/nats.py,sha256=xqpww4jZjTKY9oPGlJdDJG67L3FIBQsa9qDHxILR8r8,7687
 nucliadb_utils/tests/s3.py,sha256=IdMxK_cNdSHLvO1u8BwsKFzD87Hk1MVPDZ57zx6h-rA,3656
-nucliadb_utils-5.0.1.
-nucliadb_utils-5.0.1.
-nucliadb_utils-5.0.1.
-nucliadb_utils-5.0.1.
-nucliadb_utils-5.0.1.
+nucliadb_utils-5.0.1.post1101.dist-info/METADATA,sha256=QPBsHrTmcgVbSx8pao1HXnknyhDjP0j8TWCjRvzYcOA,2071
+nucliadb_utils-5.0.1.post1101.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+nucliadb_utils-5.0.1.post1101.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
+nucliadb_utils-5.0.1.post1101.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+nucliadb_utils-5.0.1.post1101.dist-info/RECORD,,
{nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/WHEEL
RENAMED
File without changes

{nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/top_level.txt
RENAMED
File without changes

{nucliadb_utils-5.0.1.post1091.dist-info → nucliadb_utils-5.0.1.post1101.dist-info}/zip-safe
RENAMED
File without changes