futurehouse-client 0.4.1.dev95__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
@@ -15,6 +15,7 @@ import aiohttp
 import requests as requests_lib
 from google.resumable_media import requests as resumable_requests
 from httpx import AsyncClient, Client, HTTPStatusError, codes
+from lmi.utils import gather_with_concurrency
 from requests.adapters import HTTPAdapter
 from tenacity import (
     before_sleep_log,
@@ -26,12 +27,17 @@ from tqdm import tqdm
 from urllib3.util.retry import Retry
 
 from futurehouse_client.models.data_storage_methods import (
+    CreateDatasetPayload,
     DataStorageLocationPayload,
     DataStorageRequestPayload,
     DataStorageResponse,
     DirectoryManifest,
     ManifestEntry,
 )
+from futurehouse_client.models.rest import (
+    DataStorageSearchPayload,
+    SearchCriterion,
+)
 from futurehouse_client.utils.general import retry_if_connection_error
 
 # this is only required if they're using a yaml manifest
@@ -54,6 +60,7 @@ INITIATE_HEADERS = {
     "x-goog-resumable": "start",
     "Content-Length": "0",
 }
+DOWNLOAD_CONCURRENCY = 3
 
 
 def _should_ignore_file(
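
Note on the new import and constant above: `gather_with_concurrency` and `DOWNLOAD_CONCURRENCY = 3` feed the multi-location download path added to `afetch_data_from_storage` further down, which fans out at most three GCS downloads at a time. As an illustrative sketch only (not the `lmi` source), a bounded-concurrency gather is conventionally built from `asyncio.gather` plus a semaphore:

    import asyncio
    from collections.abc import Awaitable
    from typing import TypeVar

    T = TypeVar("T")

    async def gather_with_concurrency(n: int, coros: list[Awaitable[T]]) -> list[T]:
        # At most n coroutines are awaited concurrently.
        semaphore = asyncio.Semaphore(n)

        async def sem_coro(coro: Awaitable[T]) -> T:
            async with semaphore:
                return await coro

        return await asyncio.gather(*(sem_coro(c) for c in coros))
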
@@ -438,6 +445,10 @@ class DataStorageCreationError(DataStorageError):
     """Raised when there's an error creating a data storage entry."""
 
 
+class DataStorageRetrievalError(DataStorageError):
+    """Raised when there's an error retrieving a data storage entry."""
+
+
 class ProgressWrapper:
     """Common progress wrapper for file uploads."""
 
@@ -462,32 +473,30 @@ class ProgressWrapper:
         return self.file_obj.tell()
 
 
-class DataStorageMethods:
+class DataStorageMethods:  # pylint: disable=too-many-public-methods
     """Data storage methods for RestClient.
 
     This class contains methods for interacting with the data storage API endpoints.
     """
 
     # needed for mypy `NoReturn`
-    def _handle_http_errors(self, e: HTTPStatusError) -> NoReturn:
+    def _handle_http_errors(self, e: HTTPStatusError, operation: str) -> NoReturn:
         """Handle common HTTP errors for data storage operations."""
         if e.response.status_code == codes.FORBIDDEN:
-            raise DataStorageCreationError(
-                "Not authorized to create data storage entries"
+            raise DataStorageError(
+                f"Error {operation} data storage entry, not authorized"
             ) from e
         if e.response.status_code == codes.UNPROCESSABLE_ENTITY:
-            raise DataStorageCreationError(
-                f"Invalid request payload: {e.response.text}"
-            ) from e
-        raise DataStorageCreationError(
-            f"Error creating data storage entry: {e.response.status_code} - {e.response.text}"
+            raise DataStorageError(f"Invalid request payload: {e.response.text}") from e
+        raise DataStorageError(
+            f"Error {operation} data storage entry: {e.response.status_code} - {e.response.text}"
        ) from e
 
     def _validate_file_path(self, file_path: str | Path) -> Path:
         """Validate file path exists and return Path object."""
         file_path = Path(file_path)
         if not file_path.exists():
-            raise DataStorageCreationError(f"File or directory not found: {file_path}")
+            raise DataStorageError(f"File or directory not found: {file_path}")
         return file_path
 
     def _build_zip_path(self, name: str, path: str | None) -> str:
@@ -529,19 +538,24 @@ class DataStorageMethods:
             return extracted_items[0]
         return extract_dir
 
-    async def _adownload_from_gcs(self, signed_url: str) -> Path:
+    async def _adownload_from_gcs(
+        self, signed_url: str, file_name: str | None = None
+    ) -> Path:
         """Download file from GCS using signed URL and handle unzipping if needed.
 
         Args:
             signed_url: The signed URL to download from
+            file_name: The name of the file to download
 
         Returns:
             Path to the downloaded file (or unzipped directory if it was a zip)
         """
+        file_name = file_name or "downloaded_file"
+
         try:
             with tempfile.TemporaryDirectory() as temp_dir_str:
                 temp_dir = Path(temp_dir_str)
-                temp_file = temp_dir / "downloaded_file"
+                temp_file = temp_dir / file_name
 
                 async with self.async_client.stream("GET", signed_url) as response:
                     response.raise_for_status()
@@ -549,11 +563,11 @@ class DataStorageMethods:
                     content_disposition = response.headers.get(
                         "content-disposition", ""
                     )
-                    filename = "downloaded_file"
+                    filename = file_name
                     if "filename=" in content_disposition:
                         filename = content_disposition.split("filename=")[-1].strip('"')
 
-                    if filename != "downloaded_file":
+                    if filename != file_name:
                         temp_file = temp_dir / filename
 
                     async with aiofiles.open(temp_file, "wb") as f:
@@ -583,21 +597,23 @@ class DataStorageMethods:
                 return final_file
 
         except Exception as e:
-            raise DataStorageCreationError(f"Failed to download from GCS: {e}") from e
+            raise DataStorageError(f"Failed to download from GCS: {e}") from e
 
-    def _download_from_gcs(self, signed_url: str) -> Path:
+    def _download_from_gcs(self, signed_url: str, file_name: str | None = None) -> Path:
         """Download file from GCS using signed URL and handle unzipping if needed (sync version).
 
         Args:
             signed_url: The signed URL to download from
-
+            file_name: The name of the file to download
         Returns:
             Path to the downloaded file (or unzipped directory if it was a zip)
         """
+        file_name = file_name or "downloaded_file"
+
         try:
             with tempfile.TemporaryDirectory() as temp_dir_str:
                 temp_dir = Path(temp_dir_str)
-                temp_file = temp_dir / "downloaded_file"
+                temp_file = temp_dir / file_name
 
                 with requests_lib.get(signed_url, stream=True, timeout=30) as response:
                     response.raise_for_status()
@@ -605,11 +621,11 @@ class DataStorageMethods:
                     content_disposition = response.headers.get(
                         "content-disposition", ""
                    )
-                    filename = "downloaded_file"
+                    filename = file_name
                     if "filename=" in content_disposition:
                         filename = content_disposition.split("filename=")[-1].strip('"')
 
-                    if filename != "downloaded_file":
+                    if filename != file_name:
                         temp_file = temp_dir / filename
 
                     with open(temp_file, "wb") as f:
@@ -639,7 +655,7 @@ class DataStorageMethods:
                 return final_file
 
         except Exception as e:
-            raise DataStorageCreationError(f"Failed to download from GCS: {e}") from e
+            raise DataStorageError(f"Failed to download from GCS: {e}") from e
 
     # =====================================
 
@@ -676,7 +692,7 @@ class DataStorageMethods:
     ) -> DataStorageResponse:
         """Create data storage entry via API (sync version)."""
         response = self.client.post(
-            "/v0.1/data-storage",
+            "/v0.1/data-storage/data-entries",
             json=payload.model_dump(mode="json", exclude_none=True),
         )
         response.raise_for_status()
@@ -687,7 +703,7 @@ class DataStorageMethods:
     ) -> DataStorageResponse:
         """Create data storage entry via API (async version)."""
         response = await self.async_client.post(
-            "/v0.1/data-storage",
+            "/v0.1/data-storage/data-entries",
             json=payload.model_dump(mode="json", exclude_none=True),
         )
         response.raise_for_status()
@@ -761,6 +777,7 @@ class DataStorageMethods:
         path: str | None = None,
         ignore_patterns: list[str] | None = None,
         ignore_filename: str = ".gitignore",
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Upload a directory as a single zip file collection.
 
@@ -771,6 +788,7 @@ class DataStorageMethods:
             path: Optional GCS path for the zip file
             ignore_patterns: List of patterns to ignore when zipping
             ignore_filename: Name of ignore file to read from directory
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             DataStorageResponse for the uploaded zip file
@@ -790,6 +808,7 @@ class DataStorageMethods:
             description=description,
             path=zip_gcs_path,
             is_collection=True,
+            project_id=project_id,
         )
 
         logger.debug(
@@ -797,24 +816,30 @@ class DataStorageMethods:
         )
         data_storage_response = self._create_data_storage_entry(payload)
 
-        if not data_storage_response.signed_url:
-            raise DataStorageCreationError("No signed URL returned for zip upload")
+        for storage_location in data_storage_response.storage_locations:
+            if not storage_location.storage_config.signed_url:
+                raise DataStorageCreationError(
+                    "No signed URL returned for zip upload"
+                )
 
-        with tqdm(
-            total=zip_size,
-            unit="B",
-            unit_scale=True,
-            unit_divisor=1024,
-            desc=f"Uploading {dir_path.name} (zipped)",
-            miniters=1,
-            mininterval=0.1,
-        ) as pbar:
-            _upload_file_with_progress(
-                data_storage_response.signed_url, temp_zip_path, pbar, zip_size
-            )
+            with tqdm(
+                total=zip_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {dir_path.name} (zipped)",
+                miniters=1,
+                mininterval=0.1,
+            ) as pbar:
+                _upload_file_with_progress(
+                    storage_location.storage_config.signed_url,
+                    temp_zip_path,
+                    pbar,
+                    zip_size,
+                )
 
         status_response = self.client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -832,6 +857,7 @@ class DataStorageMethods:
         path: str | None = None,
         ignore_patterns: list[str] | None = None,
         ignore_filename: str = ".gitignore",
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Asynchronously upload a directory as a single zip file.
 
@@ -842,6 +868,7 @@ class DataStorageMethods:
             path: Optional GCS path for the zip file
             ignore_patterns: List of patterns to ignore when zipping
             ignore_filename: Name of ignore file to read from directory
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             DataStorageResponse for the uploaded zip file
@@ -861,28 +888,35 @@ class DataStorageMethods:
             description=description,
             path=zip_gcs_path,
             is_collection=True,
+            project_id=project_id,
         )
 
         data_storage_response = await self._acreate_data_storage_entry(payload)
 
-        if not data_storage_response.signed_url:
-            raise DataStorageCreationError("No signed URL returned for zip upload")
+        for storage_location in data_storage_response.storage_locations:
+            if not storage_location.storage_config.signed_url:
+                raise DataStorageCreationError(
+                    "No signed URL returned for zip upload"
+                )
 
-        with tqdm(
-            total=zip_size,
-            unit="B",
-            unit_scale=True,
-            unit_divisor=1024,
-            desc=f"Uploading {dir_path.name} (zipped)",
-            miniters=1,
-            mininterval=0.1,
-        ) as pbar:
-            await _aupload_file_with_progress(
-                data_storage_response.signed_url, temp_zip_path, pbar, zip_size
-            )
+            with tqdm(
+                total=zip_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {dir_path.name} (zipped)",
+                miniters=1,
+                mininterval=0.1,
+            ) as pbar:
+                await _aupload_file_with_progress(
+                    storage_location.storage_config.signed_url,
+                    temp_zip_path,
+                    pbar,
+                    zip_size,
+                )
 
         status_response = await self.async_client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -898,6 +932,7 @@ class DataStorageMethods:
         file_path: Path,
         description: str | None,
         path: str | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Upload a single file."""
         file_size = file_path.stat().st_size
@@ -918,6 +953,7 @@ class DataStorageMethods:
                 content=text_content,
                 path=path,
                 is_collection=False,
+                project_id=project_id,
             )
 
             logger.debug("Sending file as text content")
@@ -934,6 +970,7 @@ class DataStorageMethods:
             description=description,
             path=path,
             is_collection=False,
+            project_id=project_id,
         )
 
         logger.debug(
@@ -942,30 +979,34 @@ class DataStorageMethods:
 
         data_storage_response = self._create_data_storage_entry(payload)
 
-        if not data_storage_response.signed_url:
-            raise DataStorageCreationError("No signed URL returned from server")
+        for storage_location in data_storage_response.storage_locations:
+            if not storage_location.storage_config.signed_url:
+                raise DataStorageCreationError("No signed URL returned from server")
 
-        with tqdm(
-            total=file_size,
-            unit="B",
-            unit_scale=True,
-            unit_divisor=1024,
-            desc=f"Uploading {file_path.name}",
-            miniters=1,
-            mininterval=0.1,
-        ) as pbar:
-            try:
-                _upload_file_with_progress(
-                    data_storage_response.signed_url, file_path, pbar, file_size
-                )
-                logger.debug("File upload to signed URL completed successfully")
-            except Exception as e:
-                logger.error(f"Failed to upload file to signed URL: {e}")
-                raise
+            with tqdm(
+                total=file_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {file_path.name}",
+                miniters=1,
+                mininterval=0.1,
+            ) as pbar:
+                try:
+                    _upload_file_with_progress(
+                        storage_location.storage_config.signed_url,
+                        file_path,
+                        pbar,
+                        file_size,
+                    )
+                    logger.debug("File upload to signed URL completed successfully")
+                except Exception as e:
+                    logger.error(f"Failed to upload file to signed URL: {e}")
+                    raise
 
         logger.debug("Updating data storage status to active")
         status_response = self.client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
        )
         status_response.raise_for_status()
@@ -980,6 +1021,7 @@ class DataStorageMethods:
         description: str | None,
         path: str | None = None,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Asynchronously upload a single file."""
         file_size, text_payload = self._prepare_single_file_upload(
@@ -1000,28 +1042,33 @@ class DataStorageMethods:
             path=path,
             is_collection=False,
             dataset_id=dataset_id,
+            project_id=project_id,
         )
 
         data_storage_response = await self._acreate_data_storage_entry(payload)
 
-        if not data_storage_response.signed_url:
-            raise DataStorageCreationError("No signed URL returned from server")
+        for location in data_storage_response.storage_locations:
+            if not location.storage_config.signed_url:
+                raise DataStorageCreationError(
+                    f"No signed URL returned from server for location: {location.id}"
+                )
 
-        with tqdm(
-            total=file_size,
-            unit="B",
-            unit_scale=True,
-            unit_divisor=1024,
-            desc=f"Uploading {file_path.name}",
-            miniters=1,
-            mininterval=0.1,
-        ) as pbar:
-            await _aupload_file_with_progress(
-                data_storage_response.signed_url, file_path, pbar, file_size
-            )
+            with tqdm(
+                total=file_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {file_path.name}",
+                miniters=1,
+                mininterval=0.1,
+                leave=False,
+            ) as pbar:
+                await _aupload_file_with_progress(
+                    location.storage_config.signed_url, file_path, pbar, file_size
+                )
 
         status_response = await self.async_client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -1036,6 +1083,7 @@ class DataStorageMethods:
         path: str | None,
         parent_id: UUID | None,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Upload a single file with a parent ID (sync version)."""
         file_size, text_payload = self._prepare_single_file_upload(
@@ -1046,6 +1094,7 @@ class DataStorageMethods:
             logger.debug("Sending file as text content with parent_id")
             text_payload.parent_id = parent_id
             text_payload.dataset_id = dataset_id
+            text_payload.project_id = project_id
             return self._create_data_storage_entry(text_payload)
 
         logger.debug(
@@ -1058,28 +1107,30 @@ class DataStorageMethods:
             is_collection=False,
             parent_id=parent_id,
             dataset_id=dataset_id,
+            project_id=project_id,
         )
         data_storage_response = self._create_data_storage_entry(payload)
 
-        if not data_storage_response.signed_url:
-            raise DataStorageCreationError("No signed URL returned from server")
+        for location in data_storage_response.storage_locations:
+            if not location.storage_config.signed_url:
+                raise DataStorageCreationError("No signed URL returned from server")
 
-        with tqdm(
-            total=file_size,
-            unit="B",
-            unit_scale=True,
-            unit_divisor=1024,
-            desc=f"Uploading {file_path.name}",
-            miniters=1,
-            mininterval=0.1,
-            leave=False,
-        ) as pbar:
-            _upload_file_with_progress(
-                data_storage_response.signed_url, file_path, pbar, file_size
-            )
+            with tqdm(
+                total=file_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=f"Uploading {file_path.name}",
+                miniters=1,
+                mininterval=0.1,
+                leave=False,
+            ) as pbar:
+                _upload_file_with_progress(
+                    location.storage_config.signed_url, file_path, pbar, file_size
+                )
 
         status_response = self.client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -1092,6 +1143,7 @@ class DataStorageMethods:
         dir_manifest: DirectoryManifest,
         current_parent_id: UUID,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse | None:
         """Process a single file item for upload."""
         try:
@@ -1109,6 +1161,7 @@ class DataStorageMethods:
                 path=None,
                 parent_id=current_parent_id,
                 dataset_id=dataset_id,
+                project_id=project_id,
             )
         except Exception as e:
             logger.error(f"Failed to upload file {item}: {e}")
@@ -1126,6 +1179,7 @@ class DataStorageMethods:
         base_dir: Path | None = None,
         dir_manifest: DirectoryManifest | None = None,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> list[DataStorageResponse]:
         """Upload a directory with single dataset and individual file storage entries."""
         responses = []
@@ -1141,6 +1195,7 @@ class DataStorageMethods:
             parent_id=None,
             dataset_id=None,
             is_collection=False,
+            project_id=project_id,
         )
 
         dir_response = self._create_data_storage_entry(payload)
@@ -1182,6 +1237,7 @@ class DataStorageMethods:
                     parent_id=current_parent_id,
                     dataset_id=current_dataset_id,
                     is_collection=False,
+                    project_id=project_id,
                 )
                 subdir_response = self._create_data_storage_entry(subdir_payload)
                 responses.append(subdir_response)
@@ -1197,6 +1253,7 @@ class DataStorageMethods:
                     base_dir=base_dir,
                     dir_manifest=subdir_manifest,
                     dataset_id=current_dataset_id,
+                    project_id=project_id,
                 )
                 responses.extend(subdir_responses)
             elif item.is_file():
@@ -1247,6 +1304,7 @@ class DataStorageMethods:
         path: str | None,
         parent_id: UUID | None,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Asynchronously upload a single file with a parent ID."""
         file_size, text_payload = self._prepare_single_file_upload(
@@ -1257,6 +1315,7 @@ class DataStorageMethods:
             logger.debug("Sending file as text content with parent_id")
             text_payload.parent_id = parent_id
             text_payload.dataset_id = dataset_id
+            text_payload.project_id = project_id
             return await self._acreate_data_storage_entry(text_payload)
 
         logger.debug(
@@ -1269,10 +1328,13 @@ class DataStorageMethods:
             is_collection=False,
             parent_id=parent_id,
             dataset_id=dataset_id,
+            project_id=project_id,
         )
         data_storage_response = await self._acreate_data_storage_entry(payload)
 
-        if not data_storage_response.signed_url:
+        storage_location = data_storage_response.storage_locations[0]
+
+        if not storage_location.storage_config.signed_url:
             raise DataStorageCreationError("No signed URL returned from server")
 
         with tqdm(
@@ -1285,11 +1347,11 @@ class DataStorageMethods:
             mininterval=0.1,
         ) as pbar:
             await _aupload_file_with_progress(
-                data_storage_response.signed_url, file_path, pbar, file_size
+                storage_location.storage_config.signed_url, file_path, pbar, file_size
             )
 
         status_response = await self.async_client.patch(
-            f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
             json={"status": "active"},
         )
         status_response.raise_for_status()
@@ -1302,6 +1364,7 @@ class DataStorageMethods:
         dir_manifest: DirectoryManifest,
         current_parent_id: UUID,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse | None:
         """Asynchronously process a single file item for upload."""
         try:
@@ -1319,6 +1382,7 @@ class DataStorageMethods:
                 path=None,
                 parent_id=current_parent_id,
                 dataset_id=dataset_id,
+                project_id=project_id,
             )
         except Exception as e:
             logger.error(f"Failed to upload file {item}: {e}")
@@ -1336,6 +1400,7 @@ class DataStorageMethods:
         base_dir: Path | None = None,
         dir_manifest: DirectoryManifest | None = None,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> list[DataStorageResponse]:
         """Upload a directory with single dataset and individual file storage entries (async)."""
         responses = []
@@ -1352,6 +1417,7 @@ class DataStorageMethods:
             parent_id=None,
             dataset_id=None,
             is_collection=False,
+            project_id=project_id,
         )
 
         dir_response = await self._acreate_data_storage_entry(payload)
@@ -1392,6 +1458,7 @@ class DataStorageMethods:
                     parent_id=current_parent_id,
                     dataset_id=current_dataset_id,
                     is_collection=False,
+                    project_id=project_id,
                 )
                 subdir_response = await self._acreate_data_storage_entry(subdir_payload)
                 responses.append(subdir_response)
@@ -1407,6 +1474,7 @@ class DataStorageMethods:
                     base_dir=base_dir,
                     dir_manifest=subdir_manifest,
                     dataset_id=current_dataset_id,
+                    project_id=project_id,
                 )
                 responses.extend(subdir_responses)
             elif item.is_file():
@@ -1443,6 +1511,7 @@ class DataStorageMethods:
         content: str,
         description: str | None = None,
         path: str | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Store content as a string in the data storage system.
 
@@ -1451,6 +1520,7 @@ class DataStorageMethods:
             content: Content to store as a string
             description: Optional description of the data storage entry
             path: Optional path for the data storage entry
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             DataStorageResponse containing the created data storage entry and storage locations
@@ -1464,10 +1534,11 @@ class DataStorageMethods:
                 content=content,
                 description=description,
                 path=path,
+                project_id=project_id,
             )
             return self._create_data_storage_entry(payload)
         except HTTPStatusError as e:
-            self._handle_http_errors(e)
+            self._handle_http_errors(e, "creating")
         except Exception as e:
             raise DataStorageCreationError(
                 f"An unexpected error occurred: {e!r}"
@@ -1486,6 +1557,7 @@ class DataStorageMethods:
         description: str | None = None,
         path: str | None = None,
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Asynchronously store content as a string in the data storage system.
 
@@ -1495,6 +1567,7 @@ class DataStorageMethods:
             description: Optional description of the data storage entry
             path: Optional path for the data storage entry
             dataset_id: Optional dataset ID to add entry to, or None to create new dataset
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             DataStorageResponse containing the created data storage entry and storage locations
@@ -1509,10 +1582,11 @@ class DataStorageMethods:
                 description=description,
                 path=path,
                 dataset_id=dataset_id,
+                project_id=project_id,
             )
             return await self._acreate_data_storage_entry(payload)
         except HTTPStatusError as e:
-            self._handle_http_errors(e)
+            self._handle_http_errors(e, "creating")
         except Exception as e:
             raise DataStorageCreationError(
                 f"An unexpected error occurred: {e!r}"
@@ -1534,6 +1608,7 @@ class DataStorageMethods:
         manifest_filename: str | None = None,
         ignore_patterns: list[str] | None = None,
         ignore_filename: str = ".gitignore",
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Store file or directory content in the data storage system.
 
@@ -1552,6 +1627,7 @@ class DataStorageMethods:
             manifest_filename: Name of manifest file
             ignore_patterns: List of patterns to ignore when zipping directories
             ignore_filename: Name of ignore file to read from directory (default: .gitignore)
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             DataStorageResponse containing the final data storage entry
@@ -1564,7 +1640,13 @@ class DataStorageMethods:
         try:
             if file_path.is_dir() and as_collection:
                 return self._upload_data_directory(
-                    name, file_path, description, path, ignore_patterns, ignore_filename
+                    name,
+                    file_path,
+                    description,
+                    path,
+                    ignore_patterns,
+                    ignore_filename,
+                    project_id,
                 )
             if file_path.is_dir() and not as_collection:
                 responses = self._upload_directory_hierarchically(
@@ -1574,16 +1656,19 @@ class DataStorageMethods:
                     manifest_filename=manifest_filename,
                     ignore_patterns=ignore_patterns,
                     ignore_filename=ignore_filename,
+                    project_id=project_id,
                 )
                 if not responses:
                     raise DataStorageCreationError(
                         "No data storage entries were created"
                     )
                 return responses[0]
-            return self._upload_data_single_file(name, file_path, description, path)
+            return self._upload_data_single_file(
+                name, file_path, description, path, project_id
+            )
 
         except HTTPStatusError as e:
-            self._handle_http_errors(e)
+            self._handle_http_errors(e, "creating")
         except Exception as e:
             raise DataStorageCreationError(
                 f"An unexpected error occurred during file upload: {e!r}"
@@ -1606,6 +1691,7 @@ class DataStorageMethods:
         ignore_patterns: list[str] | None = None,
         ignore_filename: str = ".gitignore",
         dataset_id: UUID | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Asynchronously store file or directory content in the data storage system.
 
@@ -1620,6 +1706,7 @@ class DataStorageMethods:
             ignore_patterns: List of patterns to ignore when zipping.
             ignore_filename: Name of ignore file to read (default: .gitignore).
             dataset_id: Optional dataset ID to add entry to, or None to create new dataset.
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             The `DataStorageResponse` for the created entry. For hierarchical uploads,
@@ -1637,6 +1724,7 @@ class DataStorageMethods:
                     path,
                     ignore_patterns,
                     ignore_filename,
+                    project_id,
                 )
             responses = await self._aupload_directory_hierarchically(
                 name=name,
@@ -1646,6 +1734,7 @@ class DataStorageMethods:
                 ignore_patterns=ignore_patterns,
                 ignore_filename=ignore_filename,
                 dataset_id=dataset_id,
+                project_id=project_id,
            )
             if not responses:
                 raise DataStorageCreationError(
@@ -1653,11 +1742,11 @@ class DataStorageMethods:
                 )
             return responses[0]
             return await self._aupload_data_single_file(
-                name, file_path, description, path, dataset_id
+                name, file_path, description, path, dataset_id, project_id
             )
 
         except HTTPStatusError as e:
-            self._handle_http_errors(e)
+            self._handle_http_errors(e, "creating")
         except Exception as e:
             raise DataStorageCreationError(
                 f"An unexpected error occurred during async file upload: {e!r}"
@@ -1674,7 +1763,9 @@ class DataStorageMethods:
         name: str,
         existing_location: DataStorageLocationPayload,
         description: str | None = None,
+        as_collection: bool = False,
         path: str | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Store content as a string in the data storage system.
 
@@ -1682,7 +1773,11 @@ class DataStorageMethods:
             name: Name of the data storage entry
             existing_location: Describes the existing data source location to register
             description: Optional description of the data storage entry
+            as_collection: If uploading a directory, `True` creates a single storage entry for
+                the whole directory and multiple storage locations for each file, `False` assumes
+                you are uploading a single file.
             path: Optional path for the data storage entry
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             DataStorageResponse containing the created data storage entry and storage locations
@@ -1696,14 +1791,17 @@ class DataStorageMethods:
                 description=description,
                 path=path,
                 existing_location=existing_location,
+                project_id=project_id,
+                is_collection=as_collection,
             )
             response = self.client.post(
-                "/v0.1/data-storage", json=payload.model_dump(exclude_none=True)
+                "/v0.1/data-storage/data-entries",
+                json=payload.model_dump(exclude_none=True),
             )
             response.raise_for_status()
             return DataStorageResponse.model_validate(response.json())
         except HTTPStatusError as e:
-            self._handle_http_errors(e)
+            self._handle_http_errors(e, "creating")
         except Exception as e:
             raise DataStorageCreationError(
                 f"An unexpected error occurred: {e!r}"
@@ -1719,8 +1817,10 @@ class DataStorageMethods:
         self,
         name: str,
         existing_location: DataStorageLocationPayload,
+        as_collection: bool = False,
         description: str | None = None,
         path: str | None = None,
+        project_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Store content as a string in the data storage system.
 
@@ -1728,7 +1828,11 @@ class DataStorageMethods:
             name: Name of the data storage entry
             existing_location: Describes the existing data source location to register
             description: Optional description of the data storage entry
+            as_collection: If uploading a directory, `True` creates a single storage entry for
+                the whole directory and multiple storage locations for each file, `False` assumes
+                you are uploading a single file.
             path: Optional path for the data storage entry
+            project_id: ID of the project this data storage entry belongs to
 
         Returns:
             DataStorageResponse containing the created data storage entry and storage locations
@@ -1742,19 +1846,290 @@ class DataStorageMethods:
             description=description,
             path=path,
             existing_location=existing_location,
+            project_id=project_id,
+            is_collection=as_collection,
         )
         response = await self.async_client.post(
-            "/v0.1/data-storage", json=payload.model_dump(exclude_none=True)
+            "/v0.1/data-storage/data-entries",
+            json=payload.model_dump(exclude_none=True),
         )
         response.raise_for_status()
         return DataStorageResponse.model_validate(response.json())
     except HTTPStatusError as e:
-        self._handle_http_errors(e)
+        self._handle_http_errors(e, "creating")
     except Exception as e:
         raise DataStorageCreationError(
             f"An unexpected error occurred: {e!r}"
         ) from e
 
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def search_data_storage(
+        self,
+        criteria: list[SearchCriterion] | None = None,
+        size: int = 10,
+    ) -> list[dict]:
+        """Search data storage objects using structured criteria.
+
+        Args:
+            criteria: List of search criteria (SearchCriterion objects with field, operator, value)
+            size: Number of results to return (1-100)
+
+        Returns:
+            List of search results with scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error searching data storage entries
+
+        Example:
+            from futurehouse_client.models.rest import SearchCriterion, SearchOperator
+            criteria = [
+                SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
+                SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
+                SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
+            ]
+            results = client.search_data_storage(criteria=criteria, size=20)
+        """
+        try:
+            payload = DataStorageSearchPayload(
+                criteria=criteria or [],
+                size=max(1, min(100, size)),  # Clamp between 1-100
+            )
+
+            response = self.client.post(
+                "/v0.1/data-storage/search",
+                json=payload.model_dump(mode="json"),
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "searching")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during search: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def asearch_data_storage(
+        self,
+        criteria: list[SearchCriterion] | None = None,
+        size: int = 10,
+    ) -> list[dict]:
+        """Asynchronously search data storage objects using structured criteria.
+
+        Args:
+            criteria: List of search criteria (SearchCriterion objects with field, operator, value)
+            size: Number of results to return (1-100)
+
+        Returns:
+            List of search results with scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error searching data storage entries
+
+        Example:
+            from futurehouse_client.models.rest import SearchCriterion, SearchOperator
+            criteria = [
+                SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
+                SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
+                SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
+            ]
+            results = await client.asearch_data_storage(criteria=criteria, size=20)
+        """
+        try:
+            payload = DataStorageSearchPayload(
+                criteria=criteria or [],
+                size=max(1, min(100, size)),  # Clamp between 1-100
+            )
+
+            response = await self.async_client.post(
+                "/v0.1/data-storage/search",
+                json=payload.model_dump(mode="json"),
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "searching")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during async search: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def similarity_search_data_storage(
+        self,
+        embedding: list[float],
+        size: int = 10,
+        min_score: float = 0.7,
+        dataset_id: UUID | None = None,
+        tags: list[str] | None = None,
+        user_id: str | None = None,
+        project_id: str | None = None,
+    ) -> list[dict]:
+        """Search data storage objects using vector similarity.
+
+        Args:
+            embedding: Embedding vector for similarity search
+            size: Number of results to return (1-100)
+            min_score: Minimum similarity score (0.0-1.0)
+            dataset_id: Optional dataset ID filter
+            tags: Optional list of tags to filter by
+            user_id: Optional user ID filter (admin only)
+            project_id: Optional project ID filter
+
+        Returns:
+            List of search results with similarity scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error performing similarity search
+        """
+        try:
+            # Validate inputs
+            if not embedding:
+                raise DataStorageCreationError("Embedding vector is required")
+
+            if not all(isinstance(x, int | float) for x in embedding):
+                raise DataStorageCreationError("Embedding must be a list of numbers")
+
+            size = max(1, min(100, size))  # Clamp between 1-100
+            min_score = max(0.0, min(1.0, min_score))  # Clamp between 0.0-1.0
+
+            # Build request payload
+            payload = {
+                "embedding": embedding,
+                "size": size,
+                "min_score": min_score,
+            }
+
+            # Add optional filters
+            if dataset_id is not None:
+                payload["dataset_id"] = str(dataset_id)
+            if tags is not None:
+                payload["tags"] = tags
+            if user_id is not None:
+                payload["user_id"] = user_id
+            if project_id is not None:
+                payload["project_id"] = project_id
+
+            response = self.client.post(
+                "/v0.1/data-storage/similarity-search", json=payload
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Similarity search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "performing similarity search")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during similarity search: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def asimilarity_search_data_storage(
+        self,
+        embedding: list[float],
+        size: int = 10,
+        min_score: float = 0.7,
+        dataset_id: UUID | None = None,
+        tags: list[str] | None = None,
+        user_id: str | None = None,
+        project_id: str | None = None,
+    ) -> list[dict]:
+        """Asynchronously search data storage objects using vector similarity.
+
+        Args:
+            embedding: Embedding vector for similarity search
+            size: Number of results to return (1-100)
+            min_score: Minimum similarity score (0.0-1.0)
+            dataset_id: Optional dataset ID filter
+            tags: Optional list of tags to filter by
+            user_id: Optional user ID filter (admin only)
+            project_id: Optional project ID filter
+
+        Returns:
+            List of search results with similarity scores and data storage information
+
+        Raises:
+            DataStorageCreationError: If there's an error performing similarity search
+        """
+        try:
+            # Validate inputs
+            if not embedding:
+                raise DataStorageCreationError("Embedding vector is required")
+
+            if not all(isinstance(x, int | float) for x in embedding):
+                raise DataStorageCreationError("Embedding must be a list of numbers")
+
+            size = max(1, min(100, size))  # Clamp between 1-100
+            min_score = max(0.0, min(1.0, min_score))  # Clamp between 0.0-1.0
+
+            # Build request payload
+            payload = {
+                "embedding": embedding,
+                "size": size,
+                "min_score": min_score,
+            }
+
+            # Add optional filters
+            if dataset_id is not None:
+                payload["dataset_id"] = str(dataset_id)
+            if tags is not None:
+                payload["tags"] = tags
+            if user_id is not None:
+                payload["user_id"] = user_id
+            if project_id is not None:
+                payload["project_id"] = project_id
+
+            response = await self.async_client.post(
+                "/v0.1/data-storage/similarity-search", json=payload
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPStatusError as e:
+            if e.response.status_code == codes.SERVICE_UNAVAILABLE:
+                raise DataStorageCreationError(
+                    "Similarity search functionality is currently unavailable"
+                ) from e
+            self._handle_http_errors(e, "performing similarity search")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during async similarity search: {e!r}"
+            ) from e
+
     # TODO: EVERYTHING BELOW THIS LINE SHOULD BE MOVED TO FH_TOOLS REPO
     # =================================================
     @retry(
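
The hunk above carries this release's headline additions: structured search (`search_data_storage`/`asearch_data_storage`) and vector similarity search (`similarity_search_data_storage`/`asimilarity_search_data_storage`). Pulling the docstring examples together, typical usage looks like this (a sketch; `client` is assumed to be an authenticated FutureHouse REST client, and the embedding vector is illustrative):

    from futurehouse_client.models.rest import SearchCriterion, SearchOperator

    criteria = [
        SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
        SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
    ]
    # Structured search; size is clamped to 1-100 by the client.
    results = client.search_data_storage(criteria=criteria, size=20)

    # Similarity search over stored embeddings; min_score is clamped to 0.0-1.0.
    hits = client.similarity_search_data_storage(
        embedding=[0.12, -0.03, 0.88],  # illustrative embedding vector
        size=5,
        min_score=0.7,
    )
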
@@ -1766,7 +2141,7 @@ class DataStorageMethods:
     def fetch_data_from_storage(
         self,
         data_storage_id: UUID | None = None,
-    ) -> str | Path | None:
+    ) -> str | Path | list[Path] | None:
         """Fetch data from the storage system (sync version).
 
         Args:
@@ -1775,29 +2150,45 @@ class DataStorageMethods:
         Returns:
             For PG_TABLE storage: string content
             For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
+            For multi-location entries: dict of location IDs to dicts with signed URL and file name
             None if not found or error occurred
         """
         if not data_storage_id:
-            raise DataStorageCreationError(
+            raise DataStorageRetrievalError(
                 "data_storage_id must be provided at this time"
            )
 
         try:
-            response = self.client.get(f"/v0.1/data-storage/{data_storage_id}")
+            response = self.client.get(
+                f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
+            )
             response.raise_for_status()
             result = DataStorageResponse.model_validate(response.json())
 
-            storage_type = result.storage_location.storage_config.storage_type
+            if len(result.storage_locations) > 1:
+                return [
+                    self._download_from_gcs(
+                        location.storage_config.signed_url or "",
+                        (location.storage_config.location or "").split("/")[-1],
+                    )
+                    for location in result.storage_locations
+                ]
+
+            # Most scenarios will only have one location
+            storage_location = result.storage_locations[0]
+            storage_type = storage_location.storage_config.storage_type
 
             if storage_type == "gcs":
-                if not result.signed_url:
-                    raise DataStorageCreationError(
+                if not storage_location.storage_config.signed_url:
+                    raise DataStorageRetrievalError(
                         "No signed URL available for GCS download"
                     )
 
-                return self._download_from_gcs(result.signed_url)
+                return self._download_from_gcs(
+                    storage_location.storage_config.signed_url
+                )
 
-            if storage_type == "raw_content":
+            if storage_type in {"raw_content", "pg_table"}:
                 content = result.data_storage.content
                 if content is None:
                     logger.warning(
@@ -1806,12 +2197,12 @@ class DataStorageMethods:
                     return None
                 return content
 
-            raise DataStorageCreationError(f"Unsupported storage type: {storage_type}")
+            raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")
 
         except HTTPStatusError as e:
-            self._handle_http_errors(e)
+            self._handle_http_errors(e, "retrieving")
         except Exception as e:
-            raise DataStorageCreationError(
+            raise DataStorageRetrievalError(
                 f"An unexpected error occurred: {e!r}"
             ) from e
 
@@ -1824,7 +2215,7 @@ class DataStorageMethods:
     async def afetch_data_from_storage(
         self,
         data_storage_id: UUID | None = None,
-    ) -> str | Path | None:
+    ) -> str | Path | list[Path] | None:
         """Fetch data from the storage system.
 
         Args:
@@ -1833,31 +2224,48 @@ class DataStorageMethods:
         Returns:
             For PG_TABLE storage: string content
             For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
+            For multi-location entries: dict of location IDs to dicts with signed URL and file name
             None if not found or error occurred
         """
         if not data_storage_id:
-            raise DataStorageCreationError(
+            raise DataStorageRetrievalError(
                 "data_storage_id must be provided at this time"
            )
 
         try:
             response = await self.async_client.get(
-                f"/v0.1/data-storage/{data_storage_id}"
+                f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
             )
             response.raise_for_status()
             result = DataStorageResponse.model_validate(response.json())
 
-            storage_type = result.storage_location.storage_config.storage_type
+            if len(result.storage_locations) > 1:
+                return await gather_with_concurrency(
+                    DOWNLOAD_CONCURRENCY,
+                    [
+                        self._adownload_from_gcs(
+                            location.storage_config.signed_url or "",
+                            (location.storage_config.location or "").split("/")[-1],
+                        )
+                        for location in result.storage_locations
+                    ],
+                )
+
+            # Most scenarios will only have one location
+            storage_location = result.storage_locations[0]
+            storage_type = storage_location.storage_config.storage_type
 
             if storage_type == "gcs":
-                if not result.signed_url:
-                    raise DataStorageCreationError(
+                if not storage_location.storage_config.signed_url:
+                    raise DataStorageRetrievalError(
                         "No signed URL available for GCS download"
                     )
 
-                return await self._adownload_from_gcs(result.signed_url)
+                return await self._adownload_from_gcs(
+                    storage_location.storage_config.signed_url
+                )
 
-            if storage_type == "raw_content":
+            if storage_type in {"raw_content", "pg_table"}:
                 content = result.data_storage.content
                 if content is None:
                     logger.warning(
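
With the two fetch hunks above, `fetch_data_from_storage`/`afetch_data_from_storage` now return `str | Path | list[Path] | None`: inline content for `raw_content`/`pg_table` entries, a single downloaded `Path` for one GCS location, and a list of downloaded `Path`s when an entry has multiple storage locations (the async variant downloads these up to three at a time via `gather_with_concurrency`). A calling sketch, with an illustrative UUID:

    from pathlib import Path
    from uuid import UUID

    data = client.fetch_data_from_storage(UUID("00000000-0000-0000-0000-000000000000"))
    if data is None:
        print("not found, or entry has no content")
    elif isinstance(data, list):  # multi-location entry: one Path per location
        for p in data:
            print("downloaded", p)
    elif isinstance(data, Path):  # single GCS location, unzipped if it was a zip
        print("downloaded", data)
    else:  # raw_content / pg_table entries come back as a string
        print("inline content:", data)
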
@@ -1866,11 +2274,189 @@ class DataStorageMethods:
                     return None
                 return content
 
-            raise DataStorageCreationError(f"Unsupported storage type: {storage_type}")
+            raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")
 
         except HTTPStatusError as e:
-            self._handle_http_errors(e)
+            self._handle_http_errors(e, "retrieving")
+        except Exception as e:
+            raise DataStorageRetrievalError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def acreate_dataset(
+        self,
+        name: str,
+        description: str | None = None,
+        dataset_id: UUID | None = None,
+    ):
+        try:
+            payload = CreateDatasetPayload(
+                name=name,
+                description=description,
+                id=dataset_id,
+            )
+            response = await self.async_client.post(
+                "/v0.1/data-storage/datasets",
+                json=payload.model_dump(exclude_none=True),
+            )
+            response.raise_for_status()
+            return CreateDatasetPayload.model_validate(response.json())
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "creating")
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def create_dataset(
+        self,
+        name: str,
+        description: str | None = None,
+        dataset_id: UUID | None = None,
+    ):
+        try:
+            payload = CreateDatasetPayload(
+                name=name,
+                description=description,
+                id=dataset_id,
+            )
+            response = self.client.post(
+                "/v0.1/data-storage/datasets",
+                json=payload.model_dump(exclude_none=True),
+            )
+            response.raise_for_status()
+            return CreateDatasetPayload.model_validate(response.json())
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "creating")
         except Exception as e:
             raise DataStorageCreationError(
                 f"An unexpected error occurred: {e!r}"
             ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def adelete_dataset(self, dataset_id: UUID):
+        """Delete a dataset.
+
+        Note: This will delete all data storage entries associated with the dataset.
+
+        Args:
+            dataset_id: ID of the dataset to delete
+
+        Raises:
+            DataStorageError: If there's an error deleting the dataset
+        """
+        try:
+            await self.async_client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def delete_dataset(self, dataset_id: UUID):
+        """Delete a dataset.
+
+        Note: This will delete all data storage entries associated with the dataset.
+
+        Args:
+            dataset_id: ID of the dataset to delete
+
+        Raises:
+            DataStorageError: If there's an error deleting the dataset
+        """
+        try:
+            self.client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def aget_dataset(self, dataset_id: UUID):
+        try:
+            response = await self.async_client.get(
+                f"/v0.1/data-storage/datasets/{dataset_id}"
+            )
+            response.raise_for_status()
+
+            return response.json()
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "retrieving")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def get_dataset(self, dataset_id: UUID):
+        try:
+            response = self.client.get(f"/v0.1/data-storage/datasets/{dataset_id}")
+            response.raise_for_status()
+
+            return response.json()
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "retrieving")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def adelete_data_storage_entry(self, data_storage_entry_id: UUID):
+        try:
+            await self.async_client.delete(
+                f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
+            )
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def delete_data_storage_entry(self, data_storage_entry_id: UUID):
+        try:
+            self.client.delete(
+                f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
+            )
+        except HTTPStatusError as e:
+            self._handle_http_errors(e, "deleting")
+        except Exception as e:
+            raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
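
The release closes with dataset CRUD and entry deletion. A lifecycle sketch using the new sync endpoints (names and IDs are illustrative; `client` is assumed to be an authenticated client):

    from uuid import uuid4

    dataset_id = uuid4()
    client.create_dataset(name="my-dataset", description="demo", dataset_id=dataset_id)

    record = client.get_dataset(dataset_id)  # dataset record as plain JSON

    # Per the docstring, deleting a dataset also deletes its data storage entries.
    client.delete_dataset(dataset_id)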