futurehouse-client 0.4.2.dev274__py3-none-any.whl → 0.4.3.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- futurehouse_client/clients/data_storage_methods.py +781 -128
- futurehouse_client/clients/rest_client.py +110 -40
- futurehouse_client/models/client.py +5 -1
- futurehouse_client/models/data_storage_methods.py +24 -10
- futurehouse_client/models/rest.py +39 -7
- futurehouse_client/utils/general.py +35 -6
- futurehouse_client/utils/world_model_tools.py +21 -2
- futurehouse_client/version.py +2 -2
- {futurehouse_client-0.4.2.dev274.dist-info → futurehouse_client-0.4.3.dev3.dist-info}/METADATA +3 -1
- futurehouse_client-0.4.3.dev3.dist-info/RECORD +23 -0
- futurehouse_client-0.4.2.dev274.dist-info/RECORD +0 -23
- {futurehouse_client-0.4.2.dev274.dist-info → futurehouse_client-0.4.3.dev3.dist-info}/WHEEL +0 -0
- {futurehouse_client-0.4.2.dev274.dist-info → futurehouse_client-0.4.3.dev3.dist-info}/licenses/LICENSE +0 -0
- {futurehouse_client-0.4.2.dev274.dist-info → futurehouse_client-0.4.3.dev3.dist-info}/top_level.txt +0 -0
```diff
@@ -7,7 +7,7 @@ import tempfile
 import zipfile
 from os import PathLike
 from pathlib import Path
-from typing import NoReturn
+from typing import Any, NoReturn
 from uuid import UUID

 import aiofiles
@@ -15,6 +15,8 @@ import aiohttp
 import requests as requests_lib
 from google.resumable_media import requests as resumable_requests
 from httpx import AsyncClient, Client, HTTPStatusError, codes
+from lmi.utils import gather_with_concurrency
+from pydantic import HttpUrl
 from requests.adapters import HTTPAdapter
 from tenacity import (
     before_sleep_log,
@@ -26,12 +28,19 @@ from tqdm import tqdm
 from urllib3.util.retry import Retry

 from futurehouse_client.models.data_storage_methods import (
+    CreateDatasetPayload,
+    DataContentType,
     DataStorageLocationPayload,
     DataStorageRequestPayload,
     DataStorageResponse,
+    DataStorageType,
     DirectoryManifest,
     ManifestEntry,
 )
+from futurehouse_client.models.rest import (
+    DataStorageSearchPayload,
+    SearchCriterion,
+)
 from futurehouse_client.utils.general import retry_if_connection_error

 # this is only required if they're using a yaml manifest
@@ -54,6 +63,7 @@ INITIATE_HEADERS = {
     "x-goog-resumable": "start",
     "Content-Length": "0",
 }
+DOWNLOAD_CONCURRENCY = 3


 def _should_ignore_file(
```
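The new `DOWNLOAD_CONCURRENCY` constant works together with the `gather_with_concurrency` import added above: multi-location entries are fetched with at most three concurrent downloads (see the `afetch_data_from_storage` hunk further down). A minimal sketch of that pattern, with a placeholder coroutine standing in for the real signed-URL download:

```python
import asyncio

from lmi.utils import gather_with_concurrency

DOWNLOAD_CONCURRENCY = 3


async def download_one(url: str) -> str:
    # Placeholder for a real signed-URL download.
    await asyncio.sleep(0.1)
    return url


async def main() -> None:
    urls = [f"https://example.com/file-{i}" for i in range(10)]
    # At most DOWNLOAD_CONCURRENCY downloads are in flight at any time,
    # mirroring how afetch_data_from_storage fans out over storage locations.
    results = await gather_with_concurrency(
        DOWNLOAD_CONCURRENCY, [download_one(u) for u in urls]
    )
    print(results)


asyncio.run(main())
```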
```diff
@@ -438,6 +448,10 @@ class DataStorageCreationError(DataStorageError):
     """Raised when there's an error creating a data storage entry."""


+class DataStorageRetrievalError(DataStorageError):
+    """Raised when there's an error retrieving a data storage entry."""
+
+
 class ProgressWrapper:
     """Common progress wrapper for file uploads."""

@@ -462,7 +476,7 @@ class ProgressWrapper:
         return self.file_obj.tell()


-class DataStorageMethods:
+class DataStorageMethods:  # pylint: disable=too-many-public-methods
     """Data storage methods for RestClient.

     This class contains methods for interacting with the data storage API endpoints.
@@ -472,14 +486,12 @@ class DataStorageMethods:
     def _handle_http_errors(self, e: HTTPStatusError, operation: str) -> NoReturn:
         """Handle common HTTP errors for data storage operations."""
         if e.response.status_code == codes.FORBIDDEN:
-            raise
+            raise DataStorageError(
                 f"Error {operation} data storage entry, not authorized"
             ) from e
         if e.response.status_code == codes.UNPROCESSABLE_ENTITY:
-            raise
-
-            ) from e
-        raise DataStorageCreationError(
+            raise DataStorageError(f"Invalid request payload: {e.response.text}") from e
+        raise DataStorageError(
             f"Error {operation} data storage entry: {e.response.status_code} - {e.response.text}"
         ) from e

@@ -487,7 +499,7 @@ class DataStorageMethods:
         """Validate file path exists and return Path object."""
         file_path = Path(file_path)
         if not file_path.exists():
-            raise
+            raise DataStorageError(f"File or directory not found: {file_path}")
         return file_path

     def _build_zip_path(self, name: str, path: str | None) -> str:
```
```diff
@@ -529,19 +541,24 @@ class DataStorageMethods:
             return extracted_items[0]
         return extract_dir

-    async def _adownload_from_gcs(
+    async def _adownload_from_gcs(
+        self, signed_url: str, file_name: str | None = None
+    ) -> Path:
         """Download file from GCS using signed URL and handle unzipping if needed.

         Args:
             signed_url: The signed URL to download from
+            file_name: The name of the file to download

         Returns:
             Path to the downloaded file (or unzipped directory if it was a zip)
         """
+        file_name = file_name or "downloaded_file"
+
         try:
             with tempfile.TemporaryDirectory() as temp_dir_str:
                 temp_dir = Path(temp_dir_str)
-                temp_file = temp_dir /
+                temp_file = temp_dir / file_name

                 async with self.async_client.stream("GET", signed_url) as response:
                     response.raise_for_status()
@@ -549,11 +566,11 @@ class DataStorageMethods:
                     content_disposition = response.headers.get(
                         "content-disposition", ""
                     )
-                    filename =
+                    filename = file_name
                     if "filename=" in content_disposition:
                         filename = content_disposition.split("filename=")[-1].strip('"')

-                    if filename !=
+                    if filename != file_name:
                         temp_file = temp_dir / filename

                     async with aiofiles.open(temp_file, "wb") as f:
@@ -583,21 +600,23 @@ class DataStorageMethods:
                 return final_file

         except Exception as e:
-            raise
+            raise DataStorageError(f"Failed to download from GCS: {e}") from e

-    def _download_from_gcs(self, signed_url: str) -> Path:
+    def _download_from_gcs(self, signed_url: str, file_name: str | None = None) -> Path:
         """Download file from GCS using signed URL and handle unzipping if needed (sync version).

         Args:
             signed_url: The signed URL to download from
-
+            file_name: The name of the file to download
         Returns:
             Path to the downloaded file (or unzipped directory if it was a zip)
         """
+        file_name = file_name or "downloaded_file"
+
         try:
             with tempfile.TemporaryDirectory() as temp_dir_str:
                 temp_dir = Path(temp_dir_str)
-                temp_file = temp_dir /
+                temp_file = temp_dir / file_name

                 with requests_lib.get(signed_url, stream=True, timeout=30) as response:
                     response.raise_for_status()
@@ -605,11 +624,11 @@ class DataStorageMethods:
                     content_disposition = response.headers.get(
                         "content-disposition", ""
                     )
-                    filename =
+                    filename = file_name
                     if "filename=" in content_disposition:
                         filename = content_disposition.split("filename=")[-1].strip('"')

-                    if filename !=
+                    if filename != file_name:
                         temp_file = temp_dir / filename

                     with open(temp_file, "wb") as f:
```
```diff
@@ -639,9 +658,7 @@ class DataStorageMethods:
                 return final_file

         except Exception as e:
-            raise
-
-    # =====================================
+            raise DataStorageError(f"Failed to download from GCS: {e}") from e

     def _prepare_single_file_upload(
         self, name: str, file_path: Path, description: str | None, path: str | None
@@ -676,7 +693,7 @@ class DataStorageMethods:
     ) -> DataStorageResponse:
         """Create data storage entry via API (sync version)."""
         response = self.client.post(
-            "/v0.1/data-storage",
+            "/v0.1/data-storage/data-entries",
             json=payload.model_dump(mode="json", exclude_none=True),
         )
         response.raise_for_status()
@@ -687,7 +704,7 @@ class DataStorageMethods:
     ) -> DataStorageResponse:
         """Create data storage entry via API (async version)."""
         response = await self.async_client.post(
-            "/v0.1/data-storage",
+            "/v0.1/data-storage/data-entries",
             json=payload.model_dump(mode="json", exclude_none=True),
         )
         response.raise_for_status()
```
```diff
@@ -800,24 +817,30 @@ class DataStorageMethods:
         )
         data_storage_response = self._create_data_storage_entry(payload)

-
-
+        for storage_location in data_storage_response.storage_locations:
+            if not storage_location.storage_config.signed_url:
+                raise DataStorageCreationError(
+                    "No signed URL returned for zip upload"
+                )

-
-
-
-
-
-
-
-
-
-
-
-
+            with tqdm(
+                total=zip_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {dir_path.name} (zipped)",
+                miniters=1,
+                mininterval=0.1,
+            ) as pbar:
+                _upload_file_with_progress(
+                    storage_location.storage_config.signed_url,
+                    temp_zip_path,
+                    pbar,
+                    zip_size,
+                )

         status_response = self.client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -871,24 +894,30 @@ class DataStorageMethods:

         data_storage_response = await self._acreate_data_storage_entry(payload)

-
-
+        for storage_location in data_storage_response.storage_locations:
+            if not storage_location.storage_config.signed_url:
+                raise DataStorageCreationError(
+                    "No signed URL returned for zip upload"
+                )

-
-
-
-
-
-
-
-
-
-
-
-
+            with tqdm(
+                total=zip_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {dir_path.name} (zipped)",
+                miniters=1,
+                mininterval=0.1,
+            ) as pbar:
+                await _aupload_file_with_progress(
+                    storage_location.storage_config.signed_url,
+                    temp_zip_path,
+                    pbar,
+                    zip_size,
+                )

         status_response = await self.async_client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
```
```diff
@@ -951,30 +980,34 @@ class DataStorageMethods:

         data_storage_response = self._create_data_storage_entry(payload)

-
-
+        for storage_location in data_storage_response.storage_locations:
+            if not storage_location.storage_config.signed_url:
+                raise DataStorageCreationError("No signed URL returned from server")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with tqdm(
+                total=file_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {file_path.name}",
+                miniters=1,
+                mininterval=0.1,
+            ) as pbar:
+                try:
+                    _upload_file_with_progress(
+                        storage_location.storage_config.signed_url,
+                        file_path,
+                        pbar,
+                        file_size,
+                    )
+                    logger.debug("File upload to signed URL completed successfully")
+                except Exception as e:
+                    logger.error(f"Failed to upload file to signed URL: {e}")
+                    raise

         logger.debug("Updating data storage status to active")
         status_response = self.client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -1015,24 +1048,28 @@ class DataStorageMethods:

         data_storage_response = await self._acreate_data_storage_entry(payload)

-
-
+        for location in data_storage_response.storage_locations:
+            if not location.storage_config.signed_url:
+                raise DataStorageCreationError(
+                    f"No signed URL returned from server for location: {location.id}"
+                )

-
-
-
-
-
-
-
-
-
-
-
-
+            with tqdm(
+                total=file_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {file_path.name}",
+                miniters=1,
+                mininterval=0.1,
+                leave=False,
+            ) as pbar:
+                await _aupload_file_with_progress(
+                    location.storage_config.signed_url, file_path, pbar, file_size
+                )

         status_response = await self.async_client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
```
```diff
@@ -1075,25 +1112,26 @@ class DataStorageMethods:
         )
         data_storage_response = self._create_data_storage_entry(payload)

-
-
+        for location in data_storage_response.storage_locations:
+            if not location.storage_config.signed_url:
+                raise DataStorageCreationError("No signed URL returned from server")

-
-
-
-
-
-
-
-
-
-
-
-
-
+            with tqdm(
+                total=file_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {file_path.name}",
+                miniters=1,
+                mininterval=0.1,
+                leave=False,
+            ) as pbar:
+                _upload_file_with_progress(
+                    location.storage_config.signed_url, file_path, pbar, file_size
+                )

         status_response = self.client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -1295,7 +1333,9 @@ class DataStorageMethods:
         )
         data_storage_response = await self._acreate_data_storage_entry(payload)

-
+        storage_location = data_storage_response.storage_locations[0]
+
+        if not storage_location.storage_config.signed_url:
             raise DataStorageCreationError("No signed URL returned from server")

         with tqdm(
@@ -1308,11 +1348,11 @@ class DataStorageMethods:
             mininterval=0.1,
         ) as pbar:
             await _aupload_file_with_progress(
-
+                storage_location.storage_config.signed_url, file_path, pbar, file_size
             )

         status_response = await self.async_client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
```
```diff
@@ -1553,6 +1593,130 @@ class DataStorageMethods:
                 f"An unexpected error occurred: {e!r}"
             ) from e

+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def astore_link(
+        self,
+        name: str,
+        url: HttpUrl,
+        description: str,
+        instructions: str,
+        api_key: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
+    ) -> DataStorageResponse:
+        """Asynchronously store a link/URL in the data storage system.
+
+        Args:
+            name: Name of the link entry
+            url: The URL/link to store
+            description: Searchable details of the link
+            instructions: Instructions for how to consume the link or api
+            api_key: Any authentication key to access the api. If this is included, you should also include
+                details of how the key should be consumed in the instructions.
+            metadata: Any additional metadata about the link
+            dataset_id: Optional dataset ID to add entry to, or None to create new dataset
+            project_id: ID of the project this data storage entry belongs to
+
+        Returns:
+            DataStorageResponse containing the created link storage entry
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the link storage entry
+        """
+        try:
+            link_metadata = metadata.copy() if metadata else {}
+            link_metadata["instructions"] = instructions
+            if api_key:
+                link_metadata["api_key"] = api_key
+
+            existing_location = DataStorageLocationPayload(
+                storage_type=DataStorageType.LINK,
+                content_type=DataContentType.TEXT,
+                location=url,
+                metadata=link_metadata or None,
+            )
+
+            payload = DataStorageRequestPayload(
+                name=name,
+                content=url,
+                description=description,
+                dataset_id=dataset_id,
+                project_id=project_id,
+                existing_location=existing_location,
+            )
+            return await self._acreate_data_storage_entry(payload)
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "creating")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
+    def store_link(
+        self,
+        name: str,
+        url: HttpUrl,
+        description: str,
+        instructions: str,
+        api_key: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
+    ) -> DataStorageResponse:
+        """Store a link/URL in the data storage system.
+
+        Args:
+            name: Name of the link entry
+            url: The URL/link to store
+            description: Searchable details of the link
+            instructions: Instructions for how to consume the link or api
+            api_key: Any authentication key to access the api. If this is included, you should also include
+                details of how the key should be consumed in the instructions.
+            metadata: Any additional metadata about the link
+            dataset_id: Optional dataset ID to add entry to, or None to create new dataset
+            project_id: ID of the project this data storage entry belongs to
+
+        Returns:
+            DataStorageResponse containing the created link storage entry
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the link storage entry
+        """
+        try:
+            link_metadata = metadata.copy() if metadata else {}
+            link_metadata["instructions"] = instructions
+            if api_key:
+                link_metadata["api_key"] = api_key
+
+            existing_location = DataStorageLocationPayload(
+                storage_type=DataStorageType.LINK,
+                content_type=DataContentType.TEXT,
+                location=url,
+                metadata=link_metadata or None,
+            )
+
+            payload = DataStorageRequestPayload(
+                name=name,
+                content=url,
+                description=description,
+                dataset_id=dataset_id,
+                project_id=project_id,
+                existing_location=existing_location,
+            )
+            return self._create_data_storage_entry(payload)
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "creating")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, max=10),
```
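A hypothetical usage sketch for the new `store_link` helper (`astore_link` is the awaitable equivalent); the `FutureHouseClient` entry point, URL, and API key below are illustrative assumptions, not taken from this diff:

```python
from futurehouse_client import FutureHouseClient  # assumed public entry point

client = FutureHouseClient(api_key="YOUR_API_KEY")  # placeholder credentials

response = client.store_link(
    name="protein-annotations-api",
    url="https://example.org/api",  # illustrative URL
    description="REST API exposing protein annotation records",
    instructions="Send the API key in the X-Api-Key header for every request.",
    api_key="EXAMPLE_KEY",  # stored in the link metadata alongside the instructions
    metadata={"team": "bio"},
)
print(response.data_storage.id)
```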
```diff
@@ -1724,6 +1888,7 @@ class DataStorageMethods:
         name: str,
         existing_location: DataStorageLocationPayload,
         description: str | None = None,
+        as_collection: bool = False,
         path: str | None = None,
         project_id: UUID | None = None,
     ) -> DataStorageResponse:
@@ -1733,6 +1898,9 @@ class DataStorageMethods:
             name: Name of the data storage entry
             existing_location: Describes the existing data source location to register
             description: Optional description of the data storage entry
+            as_collection: If uploading a directory, `True` creates a single storage entry for
+                the whole directory and multiple storage locations for each file, `False` assumes
+                you are uploading a single file.
             path: Optional path for the data storage entry
             project_id: ID of the project this data storage entry belongs to

@@ -1749,9 +1917,11 @@ class DataStorageMethods:
                 path=path,
                 existing_location=existing_location,
                 project_id=project_id,
+                is_collection=as_collection,
             )
             response = self.client.post(
-                "/v0.1/data-storage",
+                "/v0.1/data-storage/data-entries",
+                json=payload.model_dump(exclude_none=True),
             )
             response.raise_for_status()
             return DataStorageResponse.model_validate(response.json())
@@ -1772,6 +1942,7 @@ class DataStorageMethods:
         self,
         name: str,
         existing_location: DataStorageLocationPayload,
+        as_collection: bool = False,
         description: str | None = None,
         path: str | None = None,
         project_id: UUID | None = None,
@@ -1782,6 +1953,9 @@ class DataStorageMethods:
             name: Name of the data storage entry
             existing_location: Describes the existing data source location to register
             description: Optional description of the data storage entry
+            as_collection: If uploading a directory, `True` creates a single storage entry for
+                the whole directory and multiple storage locations for each file, `False` assumes
+                you are uploading a single file.
             path: Optional path for the data storage entry
             project_id: ID of the project this data storage entry belongs to

@@ -1798,9 +1972,11 @@ class DataStorageMethods:
                 path=path,
                 existing_location=existing_location,
                 project_id=project_id,
+                is_collection=as_collection,
             )
             response = await self.async_client.post(
-                "/v0.1/data-storage",
+                "/v0.1/data-storage/data-entries",
+                json=payload.model_dump(exclude_none=True),
             )
             response.raise_for_status()
             return DataStorageResponse.model_validate(response.json())
```
```diff
@@ -1811,8 +1987,274 @@ class DataStorageMethods:
                 f"An unexpected error occurred: {e!r}"
             ) from e

-
-
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def search_data_storage(
+        self,
+        criteria: list[SearchCriterion] | None = None,
+        size: int = 10,
+    ) -> list[dict]:
+        """Search data storage objects using structured criteria.
+
+        Args:
+            criteria: List of search criteria (SearchCriterion objects with field, operator, value)
+            size: Number of results to return (1-100)
+
+        Returns:
+            List of search results with scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error searching data storage entries
+
+        Example:
+            from futurehouse_client.models.rest import SearchCriterion, SearchOperator
+            criteria = [
+                SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
+                SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
+                SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
+            ]
+            results = client.search_data_storage(criteria=criteria, size=20)
+        """
+        try:
+            payload = DataStorageSearchPayload(
+                criteria=criteria or [],
+                size=max(1, min(100, size)),  # Clamp between 1-100
+            )
+
+            response = self.client.post(
+                "/v0.1/data-storage/search",
+                json=payload.model_dump(mode="json"),
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "searching")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during search: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def asearch_data_storage(
+        self,
+        criteria: list[SearchCriterion] | None = None,
+        size: int = 10,
+    ) -> list[dict]:
+        """Asynchronously search data storage objects using structured criteria.
+
+        Args:
+            criteria: List of search criteria (SearchCriterion objects with field, operator, value)
+            size: Number of results to return (1-100)
+
+        Returns:
+            List of search results with scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error searching data storage entries
+
+        Example:
+            from futurehouse_client.models.rest import SearchCriterion, SearchOperator
+            criteria = [
+                SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
+                SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
+                SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
+            ]
+            results = await client.asearch_data_storage(criteria=criteria, size=20)
+        """
+        try:
+            payload = DataStorageSearchPayload(
+                criteria=criteria or [],
+                size=max(1, min(100, size)),  # Clamp between 1-100
+            )
+
+            response = await self.async_client.post(
+                "/v0.1/data-storage/search",
+                json=payload.model_dump(mode="json"),
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "searching")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during async search: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def similarity_search_data_storage(
+        self,
+        embedding: list[float],
+        size: int = 10,
+        min_score: float = 0.7,
+        dataset_id: UUID | None = None,
+        tags: list[str] | None = None,
+        user_id: str | None = None,
+        project_id: str | None = None,
+    ) -> list[dict]:
+        """Search data storage objects using vector similarity.
+
+        Args:
+            embedding: Embedding vector for similarity search
+            size: Number of results to return (1-100)
+            min_score: Minimum similarity score (0.0-1.0)
+            dataset_id: Optional dataset ID filter
+            tags: Optional list of tags to filter by
+            user_id: Optional user ID filter (admin only)
+            project_id: Optional project ID filter
+
+        Returns:
+            List of search results with similarity scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error performing similarity search
+        """
+        try:
+            # Validate inputs
+            if not embedding:
+                raise DataStorageCreationError("Embedding vector is required")
+
+            if not all(isinstance(x, int | float) for x in embedding):
+                raise DataStorageCreationError("Embedding must be a list of numbers")
+
+            size = max(1, min(100, size))  # Clamp between 1-100
+            min_score = max(0.0, min(1.0, min_score))  # Clamp between 0.0-1.0
+
+            # Build request payload
+            payload = {
+                "embedding": embedding,
+                "size": size,
+                "min_score": min_score,
+            }
+
+            # Add optional filters
+            if dataset_id is not None:
+                payload["dataset_id"] = str(dataset_id)
+            if tags is not None:
+                payload["tags"] = tags
+            if user_id is not None:
+                payload["user_id"] = user_id
+            if project_id is not None:
+                payload["project_id"] = project_id
+
+            response = self.client.post(
+                "/v0.1/data-storage/similarity-search", json=payload
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Similarity search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "performing similarity search")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during similarity search: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def asimilarity_search_data_storage(
+        self,
+        embedding: list[float],
+        size: int = 10,
+        min_score: float = 0.7,
+        dataset_id: UUID | None = None,
+        tags: list[str] | None = None,
+        user_id: str | None = None,
+        project_id: str | None = None,
+    ) -> list[dict]:
+        """Asynchronously search data storage objects using vector similarity.
+
+        Args:
+            embedding: Embedding vector for similarity search
+            size: Number of results to return (1-100)
+            min_score: Minimum similarity score (0.0-1.0)
+            dataset_id: Optional dataset ID filter
+            tags: Optional list of tags to filter by
+            user_id: Optional user ID filter (admin only)
+            project_id: Optional project ID filter
+
+        Returns:
+            List of search results with similarity scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error performing similarity search
+        """
+        try:
+            # Validate inputs
+            if not embedding:
+                raise DataStorageCreationError("Embedding vector is required")
+
+            if not all(isinstance(x, int | float) for x in embedding):
+                raise DataStorageCreationError("Embedding must be a list of numbers")
+
+            size = max(1, min(100, size))  # Clamp between 1-100
+            min_score = max(0.0, min(1.0, min_score))  # Clamp between 0.0-1.0
+
+            # Build request payload
+            payload = {
+                "embedding": embedding,
+                "size": size,
+                "min_score": min_score,
+            }
+
+            # Add optional filters
+            if dataset_id is not None:
+                payload["dataset_id"] = str(dataset_id)
+            if tags is not None:
+                payload["tags"] = tags
+            if user_id is not None:
+                payload["user_id"] = user_id
+            if project_id is not None:
+                payload["project_id"] = project_id
+
+            response = await self.async_client.post(
+                "/v0.1/data-storage/similarity-search", json=payload
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Similarity search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "performing similarity search")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during async similarity search: {e!r}"
+            ) from e
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, max=10),
```
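Usage sketch for the new search endpoints, following the docstring's own example; the client construction and the placeholder embedding are assumptions:

```python
from futurehouse_client import FutureHouseClient  # assumed public entry point
from futurehouse_client.models.rest import SearchCriterion, SearchOperator

client = FutureHouseClient(api_key="YOUR_API_KEY")  # placeholder credentials

# Structured criteria search, as in the method's docstring example.
criteria = [
    SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
    SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
]
results = client.search_data_storage(criteria=criteria, size=20)

# Vector similarity search; the zero vector is only a stand-in for a real embedding.
hits = client.similarity_search_data_storage(
    embedding=[0.0] * 768,
    size=5,
    min_score=0.7,
    tags=["reports"],
)
```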
```diff
@@ -1822,7 +2264,7 @@ class DataStorageMethods:
     def fetch_data_from_storage(
         self,
         data_storage_id: UUID | None = None,
-    ) -> str | Path | None:
+    ) -> str | Path | list[Path] | None:
        """Fetch data from the storage system (sync version).

         Args:
@@ -1831,27 +2273,43 @@ class DataStorageMethods:
         Returns:
             For PG_TABLE storage: string content
             For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
+            For multi-location entries: dict of location IDs to dicts with signed URL and file name
             None if not found or error occurred
         """
         if not data_storage_id:
-            raise
+            raise DataStorageRetrievalError(
                 "data_storage_id must be provided at this time"
             )

         try:
-            response = self.client.get(
+            response = self.client.get(
+                f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
+            )
             response.raise_for_status()
             result = DataStorageResponse.model_validate(response.json())

-
+            if len(result.storage_locations) > 1:
+                return [
+                    self._download_from_gcs(
+                        location.storage_config.signed_url or "",
+                        (location.storage_config.location or "").split("/")[-1],
+                    )
+                    for location in result.storage_locations
+                ]
+
+            # Most scenarios will only have one location
+            storage_location = result.storage_locations[0]
+            storage_type = storage_location.storage_config.storage_type

             if storage_type == "gcs":
-                if not
-                raise
+                if not storage_location.storage_config.signed_url:
+                    raise DataStorageRetrievalError(
                         "No signed URL available for GCS download"
                     )

-                return self._download_from_gcs(
+                return self._download_from_gcs(
+                    storage_location.storage_config.signed_url
+                )

             if storage_type in {"raw_content", "pg_table"}:
                 content = result.data_storage.content
@@ -1862,12 +2320,12 @@ class DataStorageMethods:
                     return None
                 return content

-            raise
+            raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")

         except HTTPStatusError as e:
             self._handle_http_errors(e, "retrieving")
         except Exception as e:
-            raise
+            raise DataStorageRetrievalError(
                 f"An unexpected error occurred: {e!r}"
             ) from e

```
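Because `fetch_data_from_storage` now returns `str | Path | list[Path] | None`, callers should branch on the result type. A sketch, with a placeholder UUID and an assumed client setup:

```python
from pathlib import Path
from uuid import UUID

from futurehouse_client import FutureHouseClient  # assumed public entry point

client = FutureHouseClient(api_key="YOUR_API_KEY")  # placeholder credentials
entry_id = UUID("00000000-0000-0000-0000-000000000000")  # placeholder entry id

data = client.fetch_data_from_storage(data_storage_id=entry_id)

if isinstance(data, list):
    # Multi-location entry: one downloaded Path per storage location.
    for path in data:
        print("downloaded", path)
elif isinstance(data, Path):
    # Single GCS location: downloaded (and possibly unzipped) file.
    print("downloaded", data)
elif isinstance(data, str):
    # raw_content / pg_table storage: inline text content.
    print(data[:100])
```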
```diff
@@ -1880,7 +2338,7 @@ class DataStorageMethods:
     async def afetch_data_from_storage(
         self,
         data_storage_id: UUID | None = None,
-    ) -> str | Path | None:
+    ) -> str | Path | list[Path] | None:
         """Fetch data from the storage system.

         Args:
@@ -1889,29 +2347,46 @@ class DataStorageMethods:
         Returns:
             For PG_TABLE storage: string content
             For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
+            For multi-location entries: dict of location IDs to dicts with signed URL and file name
             None if not found or error occurred
         """
         if not data_storage_id:
-            raise
+            raise DataStorageRetrievalError(
                 "data_storage_id must be provided at this time"
             )

         try:
             response = await self.async_client.get(
-                f"/v0.1/data-storage/{data_storage_id}"
+                f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
             )
             response.raise_for_status()
             result = DataStorageResponse.model_validate(response.json())

-
+            if len(result.storage_locations) > 1:
+                return await gather_with_concurrency(
+                    DOWNLOAD_CONCURRENCY,
+                    [
+                        self._adownload_from_gcs(
+                            location.storage_config.signed_url or "",
+                            (location.storage_config.location or "").split("/")[-1],
+                        )
+                        for location in result.storage_locations
+                    ],
+                )
+
+            # Most scenarios will only have one location
+            storage_location = result.storage_locations[0]
+            storage_type = storage_location.storage_config.storage_type

             if storage_type == "gcs":
-                if not
-                raise
+                if not storage_location.storage_config.signed_url:
+                    raise DataStorageRetrievalError(
                         "No signed URL available for GCS download"
                     )

-                return await self._adownload_from_gcs(
+                return await self._adownload_from_gcs(
+                    storage_location.storage_config.signed_url
+                )

             if storage_type in {"raw_content", "pg_table"}:
                 content = result.data_storage.content
```
```diff
@@ -1922,11 +2397,189 @@ class DataStorageMethods:
                     return None
                 return content

-            raise
+            raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")

         except HTTPStatusError as e:
             self._handle_http_errors(e, "retrieving")
+        except Exception as e:
+            raise DataStorageRetrievalError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def acreate_dataset(
+        self,
+        name: str,
+        description: str | None = None,
+        dataset_id: UUID | None = None,
+    ):
+        try:
+            payload = CreateDatasetPayload(
+                name=name,
+                description=description,
+                id=dataset_id,
+            )
+            response = await self.async_client.post(
+                "/v0.1/data-storage/datasets",
+                json=payload.model_dump(exclude_none=True),
+            )
+            response.raise_for_status()
+            return CreateDatasetPayload.model_validate(response.json())
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "creating")
         except Exception as e:
             raise DataStorageCreationError(
                 f"An unexpected error occurred: {e!r}"
             ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def create_dataset(
+        self,
+        name: str,
+        description: str | None = None,
+        dataset_id: UUID | None = None,
+    ):
+        try:
+            payload = CreateDatasetPayload(
+                name=name,
+                description=description,
+                id=dataset_id,
+            )
+            response = self.client.post(
+                "/v0.1/data-storage/datasets",
+                json=payload.model_dump(exclude_none=True),
+            )
+            response.raise_for_status()
+            return CreateDatasetPayload.model_validate(response.json())
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "creating")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def adelete_dataset(self, dataset_id: UUID):
+        """Delete a dataset.
+
+        Note: This will delete all data storage entries associated with the dataset.
+
+        Args:
+            dataset_id: ID of the dataset to delete
+
+        Raises:
+            DataStorageError: If there's an error deleting the dataset
+        """
+        try:
+            await self.async_client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def delete_dataset(self, dataset_id: UUID):
+        """Delete a dataset.
+
+        Note: This will delete all data storage entries associated with the dataset.
+
+        Args:
+            dataset_id: ID of the dataset to delete
+
+        Raises:
+            DataStorageError: If there's an error deleting the dataset
+        """
+        try:
+            self.client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def aget_dataset(self, dataset_id: UUID):
+        try:
+            response = await self.async_client.get(
+                f"/v0.1/data-storage/datasets/{dataset_id}"
+            )
+            response.raise_for_status()
+
+            return response.json()
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "retrieving")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def get_dataset(self, dataset_id: UUID):
+        try:
+            response = self.client.get(f"/v0.1/data-storage/datasets/{dataset_id}")
+            response.raise_for_status()
+
+            return response.json()
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "retrieving")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def adelete_data_storage_entry(self, data_storage_entry_id: UUID):
+        try:
+            await self.async_client.delete(
+                f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
+            )
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def delete_data_storage_entry(self, data_storage_entry_id: UUID):
+        try:
+            self.client.delete(
+                f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
+            )
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
```
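Finally, a sketch of the new dataset helpers; the names and UUID are illustrative, and it assumes the create endpoint echoes the dataset id back in its `CreateDatasetPayload` response:

```python
from uuid import UUID

from futurehouse_client import FutureHouseClient  # assumed public entry point

client = FutureHouseClient(api_key="YOUR_API_KEY")  # placeholder credentials

dataset = client.create_dataset(name="papers", description="PDF corpus")
print(client.get_dataset(dataset.id))  # assumes the server returned the id

# Individual entries can be removed by id; deleting the dataset also deletes
# all data storage entries associated with it (per the adelete_dataset docstring).
entry_id = UUID("00000000-0000-0000-0000-000000000000")  # placeholder entry id
client.delete_data_storage_entry(data_storage_entry_id=entry_id)
client.delete_dataset(dataset.id)
```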