nucliadb-utils 5.0.1.post1091__py3-none-any.whl → 5.0.1.post1101__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -28,7 +28,7 @@ from typing import Any, AsyncGenerator, Optional
28
28
  import backoff
29
29
  import httpx
30
30
 
31
- from nucliadb_telemetry.metrics import Observer
31
+ from nucliadb_telemetry.metrics import INF, Histogram, Observer
32
32
  from nucliadb_utils.aiopynecone.exceptions import (
33
33
  PineconeAPIError,
34
34
  PineconeRateLimitError,
@@ -46,6 +46,25 @@ from nucliadb_utils.aiopynecone.models import (
46
46
 
47
47
  logger = logging.getLogger(__name__)
48
48
 
49
+ upsert_batch_size_histogram = Histogram(
50
+ "pinecone_upsert_batch_size",
51
+ buckets=[10.0, 100.0, 200.0, 500.0, 1000.0, 5000.0, INF],
52
+ )
53
+ upsert_batch_count_histogram = Histogram(
54
+ "pinecone_upsert_batch_count",
55
+ buckets=[0.0, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0, INF],
56
+ )
57
+
58
+ delete_batch_size_histogram = Histogram(
59
+ "pinecone_delete_batch_size",
60
+ buckets=[1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 150.0, INF],
61
+ )
62
+
63
+ delete_batch_count_histogram = Histogram(
64
+ "pinecone_delete_batch_count",
65
+ buckets=[0.0, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0, INF],
66
+ )
67
+
49
68
 
50
69
  pinecone_observer = Observer(
51
70
  "pinecone_client",
@@ -57,7 +76,6 @@ pinecone_observer = Observer(
57
76
 
58
77
  DEFAULT_TIMEOUT = 30
59
78
  CONTROL_PLANE_BASE_URL = "https://api.pinecone.io/"
60
- INDEX_HOST_BASE_URL = "https://{index_host}/"
61
79
  BASE_API_HEADERS = {
62
80
  "Content-Type": "application/json",
63
81
  "Accept": "application/json",
@@ -198,6 +216,7 @@ class DataPlane:
198
216
  if len(vectors) == 0:
199
217
  # Nothing to upsert.
200
218
  return
219
+ upsert_batch_size_histogram.observe(len(vectors))
201
220
  headers = {"Api-Key": self.api_key}
202
221
  payload = UpsertRequest(vectors=vectors)
203
222
  post_kwargs: dict[str, Any] = {
@@ -261,7 +280,10 @@ class DataPlane:
261
280
  for batch in batchify(vectors, batch_size):
262
281
  tasks.append(asyncio.create_task(_upsert_batch(batch)))
263
282
 
264
- await asyncio.gather(*tasks)
283
+ upsert_batch_count_histogram.observe(len(tasks))
284
+
285
+ if len(tasks) > 0:
286
+ await asyncio.gather(*tasks)
265
287
 
266
288
  @backoff.on_exception(
267
289
  backoff.expo,
@@ -280,8 +302,16 @@ class DataPlane:
280
302
  """
281
303
  if len(ids) > MAX_DELETE_BATCH_SIZE:
282
304
  raise ValueError(f"Maximum number of ids in a single request is {MAX_DELETE_BATCH_SIZE}.")
305
+ if len(ids) == 0: # pragma: no cover
306
+ return
283
307
 
308
+ delete_batch_size_histogram.observe(len(ids))
284
309
  headers = {"Api-Key": self.api_key}
310
+
311
+ # This is a temporary log info to hunt down a bug.
312
+ rids = {vid.split("/")[0] for vid in ids}
313
+ logger.info(f"Deleting vectors from resources: {list(rids)}")
314
+
285
315
  payload = {"ids": ids}
286
316
  post_kwargs: dict[str, Any] = {
287
317
  "headers": headers,
@@ -428,7 +458,10 @@ class DataPlane:
428
458
  async for batch in async_batchify(async_iterable, batch_size):
429
459
  tasks.append(asyncio.create_task(_delete_batch(batch)))
430
460
 
431
- await asyncio.gather(*tasks)
461
+ delete_batch_count_histogram.observe(len(tasks))
462
+
463
+ if len(tasks) > 0:
464
+ await asyncio.gather(*tasks)
432
465
 
433
466
  @backoff.on_exception(
434
467
  backoff.expo,
@@ -516,8 +549,12 @@ class PineconeSession:
516
549
  if session is not None:
517
550
  return session
518
551
 
552
+ base_url = index_host
553
+ if not index_host.startswith("https://"):
554
+ base_url = f"https://{index_host}/"
555
+
519
556
  session = httpx.AsyncClient(
520
- base_url=INDEX_HOST_BASE_URL.format(index_host=index_host),
557
+ base_url=base_url,
521
558
  headers=BASE_API_HEADERS,
522
559
  timeout=DEFAULT_TIMEOUT,
523
560
  )
@@ -34,6 +34,7 @@ import aiohttp.client_exceptions
34
34
  import backoff
35
35
  import google.auth.transport.requests # type: ignore
36
36
  import yarl
37
+ from google.auth.exceptions import DefaultCredentialsError # type: ignore
37
38
  from google.oauth2 import service_account # type: ignore
38
39
 
39
40
  from nucliadb_protos.resources_pb2 import CloudFile
@@ -458,12 +459,25 @@ class GCSStorage(Storage):
458
459
  url: str = "https://www.googleapis.com",
459
460
  scopes: Optional[List[str]] = None,
460
461
  ):
461
- if account_credentials is not None:
462
+ if account_credentials is None:
463
+ self._json_credentials = None
464
+ elif isinstance(account_credentials, str) and account_credentials.strip() == "":
465
+ self._json_credentials = None
466
+ else:
462
467
  self._json_credentials = json.loads(base64.b64decode(account_credentials))
468
+
469
+ if self._json_credentials is not None:
463
470
  self._credentials = service_account.Credentials.from_service_account_info(
464
471
  self._json_credentials,
465
472
  scopes=DEFAULT_SCOPES if scopes is None else scopes,
466
473
  )
474
+ else:
475
+ try:
476
+ self._credentials, self._project = google.auth.default()
477
+ except DefaultCredentialsError:
478
+ logger.warning("Setting up without credentials as couldn't find workload identity")
479
+ self._credentials = None
480
+
467
481
  self.source = CloudFile.GCS
468
482
  self.deadletter_bucket = deadletter_bucket
469
483
  self.indexing_bucket = indexing_bucket
@@ -473,16 +487,15 @@ class GCSStorage(Storage):
473
487
  # https://cloud.google.com/storage/docs/bucket-locations
474
488
  self._bucket_labels = labels or {}
475
489
  self._executor = executor
476
- self._creation_access_token = datetime.now()
477
490
  self._upload_url = url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable" # noqa
478
491
  self.object_base_url = url + "/storage/v1/b"
479
492
  self._client = None
480
493
 
481
494
  def _get_access_token(self):
482
- if self._credentials.valid is False:
483
- req = google.auth.transport.requests.Request()
484
- self._credentials.refresh(req)
485
- self._creation_access_token = datetime.now()
495
+ if self._credentials.expired or self._credentials.valid is False:
496
+ request = google.auth.transport.requests.Request()
497
+ self._credentials.refresh(request)
498
+
486
499
  return self._credentials.token
487
500
 
488
501
  @storage_ops_observer.wrap({"type": "initialize"})
@@ -552,7 +565,9 @@ class GCSStorage(Storage):
552
565
  raise AttributeError()
553
566
 
554
567
  headers = await self.get_access_headers()
555
- url = f"{self.object_base_url}/{bucket_name}?project={self._project}"
568
+ # Using object access url instead of bucket access to avoid
569
+ # giving admin permission to the SA, needed to GET a bucket
570
+ url = f"{self.object_base_url}/{bucket_name}/o"
556
571
  async with self.session.get(
557
572
  url,
558
573
  headers=headers,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nucliadb_utils
3
- Version: 5.0.1.post1091
3
+ Version: 5.0.1.post1101
4
4
  Home-page: https://nuclia.com
5
5
  License: BSD
6
6
  Classifier: Development Status :: 4 - Beta
@@ -24,8 +24,8 @@ Requires-Dist: PyNaCl
24
24
  Requires-Dist: pyjwt>=2.4.0
25
25
  Requires-Dist: memorylru>=1.1.2
26
26
  Requires-Dist: mrflagly>=0.2.9
27
- Requires-Dist: nucliadb-protos>=5.0.1.post1091
28
- Requires-Dist: nucliadb-telemetry>=5.0.1.post1091
27
+ Requires-Dist: nucliadb-protos>=5.0.1.post1101
28
+ Requires-Dist: nucliadb-telemetry>=5.0.1.post1101
29
29
  Provides-Extra: cache
30
30
  Requires-Dist: redis>=4.3.4; extra == "cache"
31
31
  Requires-Dist: orjson>=3.6.7; extra == "cache"
@@ -18,7 +18,7 @@ nucliadb_utils/store.py,sha256=kQ35HemE0v4_Qg6xVqNIJi8vSFAYQtwI3rDtMsNy62Y,890
18
18
  nucliadb_utils/transaction.py,sha256=mwcI3aIHAvU5KOGqd_Uz_d1XQzXhk_-NWY8NqU1lfb0,7307
19
19
  nucliadb_utils/utilities.py,sha256=idajCm_4Sojh7b3HTkP0fTfG2Mb6PIB9xtMmcfB7Nl0,15758
20
20
  nucliadb_utils/aiopynecone/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
21
- nucliadb_utils/aiopynecone/client.py,sha256=EFAQbNp-OHZ55vyXWzdzzp_59FGacGs_nfM2sMBIG8A,21153
21
+ nucliadb_utils/aiopynecone/client.py,sha256=3DGJuHXO9Vm83atMLD4fd2FTz8SAr8RvPCGU_bXH9Ho,22333
22
22
  nucliadb_utils/aiopynecone/exceptions.py,sha256=EEE0XoGs1zIB5yOJ_fy6yoG4uIb4cWIawYdJeNe4eDo,3012
23
23
  nucliadb_utils/aiopynecone/models.py,sha256=ketK2IYLWiwFZ76rnJmwfcuopFJrCAtCUszdTSurm_Q,3236
24
24
  nucliadb_utils/audit/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -47,7 +47,7 @@ nucliadb_utils/nuclia_usage/utils/kb_usage_report.py,sha256=lTr9CMBpdk34KtkH5K8v
47
47
  nucliadb_utils/storages/__init__.py,sha256=5Qc8AUWiJv9_JbGCBpAn88AIJhwDlm0OPQpg2ZdRL4U,872
48
48
  nucliadb_utils/storages/azure.py,sha256=egMDwLNIGSQyVevuySt2AswzFdNAcih05BbRg3-p8IU,16015
49
49
  nucliadb_utils/storages/exceptions.py,sha256=mm_wX4YRtp7u7enkk_4pMSlX5AQQuFbq4xLmupVDt3Y,2502
50
- nucliadb_utils/storages/gcs.py,sha256=k1WXZDBQXSizO_3kEJef7EmyInnASJsAy5VTtEw8ly4,27020
50
+ nucliadb_utils/storages/gcs.py,sha256=TQlPpg9HzfHbOu_iCLPUo9KAql1IIjWxJ9mVJ9_CrV8,27658
51
51
  nucliadb_utils/storages/local.py,sha256=NxC_nMBd38NDsR266DSgoBLdQlvUwf0_sd50r-BLI0E,10288
52
52
  nucliadb_utils/storages/nuclia.py,sha256=vEv94xAT7QM2g80S25QyrOw2pzvP2BAX-ADgZLtuCVc,2097
53
53
  nucliadb_utils/storages/object_store.py,sha256=Tw10GmpYfM5TMqJ3Tk9pLQ9wLMBk1-snL_m6uasiZDQ,4257
@@ -64,8 +64,8 @@ nucliadb_utils/tests/indexing.py,sha256=YW2QhkhO9Q_8A4kKWJaWSvXvyQ_AiAwY1VylcfVQ
64
64
  nucliadb_utils/tests/local.py,sha256=7nuP8EFUAiA8ZH50R1iPV9EUXBySQxOanVm3Zht_e0g,1835
65
65
  nucliadb_utils/tests/nats.py,sha256=xqpww4jZjTKY9oPGlJdDJG67L3FIBQsa9qDHxILR8r8,7687
66
66
  nucliadb_utils/tests/s3.py,sha256=IdMxK_cNdSHLvO1u8BwsKFzD87Hk1MVPDZ57zx6h-rA,3656
67
- nucliadb_utils-5.0.1.post1091.dist-info/METADATA,sha256=36ObBdcAROkELutBroI5F89yEBH8CxZ5B2Gj90RA9_c,2071
68
- nucliadb_utils-5.0.1.post1091.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
69
- nucliadb_utils-5.0.1.post1091.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
70
- nucliadb_utils-5.0.1.post1091.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
71
- nucliadb_utils-5.0.1.post1091.dist-info/RECORD,,
67
+ nucliadb_utils-5.0.1.post1101.dist-info/METADATA,sha256=QPBsHrTmcgVbSx8pao1HXnknyhDjP0j8TWCjRvzYcOA,2071
68
+ nucliadb_utils-5.0.1.post1101.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
69
+ nucliadb_utils-5.0.1.post1101.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
70
+ nucliadb_utils-5.0.1.post1101.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
71
+ nucliadb_utils-5.0.1.post1101.dist-info/RECORD,,