futurehouse-client 0.4.2.dev11__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -0,0 +1,2649 @@
1
+ import asyncio
2
+ import fnmatch
3
+ import json
4
+ import logging
5
+ import shutil
6
+ import tempfile
7
+ import zipfile
8
+ from os import PathLike
9
+ from pathlib import Path
10
+ from typing import Any, NoReturn
11
+ from uuid import UUID
12
+
13
+ import aiofiles
14
+ import aiohttp
15
+ import requests as requests_lib
16
+ from google.resumable_media import requests as resumable_requests
17
+ from httpx import AsyncClient, Client, HTTPStatusError, codes
18
+ from lmi.utils import gather_with_concurrency
19
+ from pydantic import HttpUrl
20
+ from requests.adapters import HTTPAdapter
21
+ from tenacity import (
22
+ before_sleep_log,
23
+ retry,
24
+ stop_after_attempt,
25
+ wait_exponential,
26
+ )
27
+ from tqdm import tqdm
28
+ from urllib3.util.retry import Retry
29
+
30
+ from futurehouse_client.models.data_storage_methods import (
31
+ CreateDatasetPayload,
32
+ DataContentType,
33
+ DataStorageLocationPayload,
34
+ DataStorageRequestPayload,
35
+ DataStorageResponse,
36
+ DataStorageType,
37
+ DirectoryManifest,
38
+ ManifestEntry,
39
+ )
40
+ from futurehouse_client.models.rest import (
41
+ DataStorageSearchPayload,
42
+ SearchCriterion,
43
+ )
44
+ from futurehouse_client.utils.general import retry_if_connection_error
45
+
46
+ # only required when a YAML manifest is used
47
+ try:
48
+ import yaml
49
+ except ImportError:
50
+ yaml = None # type: ignore[assignment]
51
+
52
+
53
+ logger = logging.getLogger(__name__)
54
+
55
+ # TODO: pdf support, unsure what package we want to use
56
+ SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT = ["txt", "md", "csv", "json", "yaml", "yml"]
57
+ CHUNK_SIZE = 8 * 1024 * 1024 # 8MB
58
+ MAX_RETRIES = 3
59
+ SMALL_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024 # 10MB
60
+ HTTP_RESUME_INCOMPLETE = 308
61
+ INITIATE_HEADERS = {
62
+ "Content-Type": "application/octet-stream",
63
+ "x-goog-resumable": "start",
64
+ "Content-Length": "0",
65
+ }
66
+ DOWNLOAD_CONCURRENCY = 3
67
+
68
+
69
+ def _should_ignore_file(
70
+ file_path: Path | PathLike,
71
+ base_path: Path | PathLike,
72
+ ignore_patterns: list[str] | None = None,
73
+ ) -> bool:
74
+ """Check if a file should be ignored based on ignore patterns.
75
+
76
+ Args:
77
+ file_path: Path to the file to check
78
+ base_path: Base directory path
79
+ ignore_patterns: List of ignore patterns (supports gitignore-style patterns)
80
+
81
+ Returns:
82
+ True if file should be ignored
83
+ """
84
+ if not ignore_patterns:
85
+ return False
86
+
87
+ try:
88
+ file_path = Path(file_path)
89
+ base_path = Path(base_path)
90
+ rel_path = file_path.relative_to(base_path)
91
+ rel_path_str = str(rel_path)
92
+
93
+ for pattern in ignore_patterns:
94
+ pattern = pattern.strip()
95
+ if not pattern or pattern.startswith("#"):
96
+ continue
97
+
98
+ is_absolute_match = pattern.startswith("/") and rel_path_str.startswith(
99
+ pattern[1:]
100
+ )
101
+ is_nested_match = "/" in pattern and pattern in rel_path_str
102
+ is_name_match = fnmatch.fnmatch(file_path.name, pattern)
103
+ is_part_match = pattern in rel_path.parts
104
+
105
+ if is_absolute_match or is_nested_match or is_name_match or is_part_match:
106
+ return True
107
+
108
+ except ValueError:
109
+ pass
110
+
111
+ return False
112
+
113
+
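For reference, a minimal sketch of how the matcher above behaves (the paths and patterns are hypothetical, and the example assumes this module is imported):

    from pathlib import Path

    # name patterns match anywhere; "/"-prefixed patterns anchor to the base directory
    _should_ignore_file(Path("repo/src/cache.pyc"), Path("repo"), ["*.pyc"])   # True  (name match)
    _should_ignore_file(Path("repo/docs/readme.md"), Path("repo"), ["/docs"])  # True  (anchored match)
    _should_ignore_file(Path("repo/src/main.py"), Path("repo"), ["*.pyc"])     # False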
114
+ def _read_ignore_file(dir_path: Path, ignore_filename: str = ".gitignore") -> list[str]:
115
+ """Read ignore patterns from a file in the directory.
116
+
117
+ Args:
118
+ dir_path: Directory to look for ignore file
119
+ ignore_filename: Name of ignore file to read
120
+
121
+ Returns:
122
+ List of ignore patterns
123
+ """
124
+ ignore_file = dir_path / ignore_filename
125
+ if ignore_file.exists():
126
+ try:
127
+ with open(ignore_file, encoding="utf-8") as f:
128
+ return [line.strip() for line in f]
129
+ except Exception as e:
130
+ logger.warning(f"Failed to read {ignore_filename}: {e}")
131
+ return []
132
+ else:
133
+ return []
134
+
135
+
136
+ def _collect_ignore_patterns(
137
+ dir_path: Path,
138
+ ignore_patterns: list[str] | None = None,
139
+ ignore_filename: str = ".gitignore",
140
+ ) -> list[str]:
141
+ """Collect all ignore patterns from multiple sources.
142
+
143
+ Args:
144
+ dir_path: Directory to check for ignore files
145
+ ignore_patterns: Explicit ignore patterns
146
+ ignore_filename: Name of ignore file to read from directory
147
+
148
+ Returns:
149
+ Combined list of ignore patterns
150
+ """
151
+ all_ignore_patterns = list(ignore_patterns or [])  # copy so the caller's list is not mutated
152
+ file_patterns = _read_ignore_file(dir_path, ignore_filename)
153
+ all_ignore_patterns.extend(file_patterns)
154
+
155
+ default_ignores = [".git", "__pycache__", "*.pyc", ".DS_Store", "node_modules"]
156
+ all_ignore_patterns.extend(default_ignores)
157
+
158
+ return all_ignore_patterns
159
+
160
+
161
+ def _create_directory_zip(
162
+ dir_path: Path,
163
+ zip_path: Path,
164
+ ignore_patterns: list[str] | None = None,
165
+ ignore_filename: str = ".gitignore",
166
+ ) -> int:
167
+ """Create a zip file from a directory with ignore patterns.
168
+
169
+ Args:
170
+ dir_path: Directory to zip
171
+ zip_path: Output zip file path
172
+ ignore_patterns: Explicit ignore patterns
173
+ ignore_filename: Name of ignore file to read from directory
174
+
175
+ Returns:
176
+ Size of created zip file in bytes
177
+ """
178
+ all_ignore_patterns = _collect_ignore_patterns(
179
+ dir_path, ignore_patterns, ignore_filename
180
+ )
181
+
182
+ logger.debug(f"Creating zip with ignore patterns: {all_ignore_patterns}")
183
+
184
+ with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
185
+ for file_path in dir_path.rglob("*"):
186
+ if file_path.is_file() and not _should_ignore_file(
187
+ file_path, dir_path, all_ignore_patterns
188
+ ):
189
+ arcname = file_path.relative_to(dir_path)
190
+ zipf.write(file_path, arcname)
191
+ logger.debug(f"Added to zip: {arcname}")
192
+
193
+ zip_size = zip_path.stat().st_size
194
+ logger.debug(f"Created zip file {zip_path} with size {zip_size:,} bytes")
195
+ return zip_size
196
+
197
+
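For reference, a minimal sketch of bundling a directory with the helper above (the directory name, output path, and extra pattern are hypothetical):

    from pathlib import Path

    size = _create_directory_zip(
        Path("my_dataset"),            # directory to bundle (hypothetical)
        Path("/tmp/my_dataset.zip"),   # output archive (hypothetical)
        ignore_patterns=["*.tmp"],
        ignore_filename=".gitignore",
    )
    print(f"zip size: {size:,} bytes")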
198
+ def _should_send_as_text_content(file_path: Path, file_size: int) -> bool:
199
+ """Check if a file should be sent as text content instead of file upload.
200
+
201
+ Args:
202
+ file_path: Path to the file
203
+ file_size: Size of file in bytes
204
+
205
+ Returns:
206
+ True if file should be sent as text content
207
+ """
208
+ # small files can be treated as raw text
209
+ if file_size >= SMALL_FILE_THRESHOLD_BYTES:
210
+ return False
211
+
212
+ file_extension = file_path.suffix.lower().lstrip(".")
213
+ return file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT
214
+
215
+
216
+ def _extract_text_from_file(file_path: Path) -> str | None:
217
+ """Extract text content from a file.
218
+
219
+ Args:
220
+ file_path: Path to the file
221
+
222
+ Returns:
223
+ Extracted text content or None if extraction failed
224
+ """
225
+ file_extension = file_path.suffix.lower().lstrip(".")
226
+
227
+ if file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT:
228
+ try:
229
+ return file_path.read_text(encoding="utf-8")
230
+ except Exception as e:
231
+ logger.warning(f"Failed to extract text from {file_path}: {e}")
232
+ return None
233
+ else:
234
+ return None
235
+
236
+
237
+ def _setup_upload_progress(file_path: Path, file_size: int, progress_bar: tqdm) -> None:
238
+ """Common setup for upload progress tracking."""
239
+ logger.debug(
240
+ f"Starting resumable upload for file: {file_path} (size: {file_size:,} bytes)"
241
+ )
242
+ progress_bar.set_description(f"Uploading {file_path.name}")
243
+ progress_bar.refresh()
244
+
245
+
246
+ async def _initiate_resumable_session(
247
+ session: aiohttp.ClientSession, signed_url: str
248
+ ) -> str:
249
+ """Initiate resumable upload session and return session URI."""
250
+ logger.debug("Initiating resumable upload session")
251
+ async with session.post(signed_url, headers=INITIATE_HEADERS) as initiate_response:
252
+ if initiate_response.status not in {200, 201}:
253
+ error_text = await initiate_response.text()
254
+ logger.error(
255
+ f"Failed to initiate resumable session: {initiate_response.status}"
256
+ )
257
+ logger.error(f"Response: {error_text}")
258
+ initiate_response.raise_for_status()
259
+
260
+ return _validate_session_uri(initiate_response.headers.get("location"))
261
+
262
+
263
+ # TODO: temp
264
+ def _log_upload_debug(signed_url: str) -> None:
265
+ """Common debug logging for uploads."""
266
+ logger.debug(f"Signed URL: {signed_url[:100]}...")
267
+
268
+
269
+ # TODO: temp
270
+ def _validate_session_uri(session_uri: str | None) -> str:
271
+ """Validate and return session URI or raise exception."""
272
+ if not session_uri:
273
+ raise DataStorageError(
274
+ "No session URI returned from resumable upload initiation"
275
+ )
276
+ logger.debug(f"Resumable session initiated. Session URI: {session_uri[:100]}...")
277
+ return session_uri
278
+
279
+
280
+ async def _upload_chunk_with_retry(
281
+ session: aiohttp.ClientSession,
282
+ session_uri: str,
283
+ chunk_data: bytes,
284
+ range_start: int,
285
+ file_size: int,
286
+ progress_bar: tqdm,
287
+ ) -> int:
288
+ """Upload a single chunk with retry logic."""
289
+ range_end = range_start + len(chunk_data) - 1
290
+ chunk_headers = {
291
+ "Content-Type": "application/octet-stream",
292
+ "Content-Length": str(len(chunk_data)),
293
+ "Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
294
+ }
295
+
296
+ for attempt in range(MAX_RETRIES):
297
+ try:
298
+ async with session.put(
299
+ session_uri, data=chunk_data, headers=chunk_headers
300
+ ) as chunk_response:
301
+ if chunk_response.status == HTTP_RESUME_INCOMPLETE:
302
+ progress_bar.update(len(chunk_data))
303
+ logger.debug(f"Uploaded chunk: {range_end + 1}/{file_size} bytes")
304
+ return len(chunk_data)
305
+ if chunk_response.status in {200, 201}:
306
+ progress_bar.update(len(chunk_data))
307
+ logger.debug(
308
+ f"Upload completed successfully. Final response: {chunk_response.status}"
309
+ )
310
+ return len(chunk_data)
311
+
312
+ error_text = await chunk_response.text()
313
+ logger.warning(
314
+ f"Chunk upload failed (attempt {attempt + 1}/{MAX_RETRIES}): {chunk_response.status}"
315
+ )
316
+ logger.warning(f"Response: {error_text}")
317
+ if attempt == MAX_RETRIES - 1:
318
+ chunk_response.raise_for_status()
319
+
320
+ except (TimeoutError, aiohttp.ClientError) as e:
321
+ logger.warning(
322
+ f"Chunk upload error (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
323
+ )
324
+ if attempt == MAX_RETRIES - 1:
325
+ raise
326
+ await asyncio.sleep(2**attempt)
327
+
328
+ return 0
329
+
330
+
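For reference, the chunking arithmetic used by the uploader above, shown with a hypothetical 20MB file; GCS answers 308 while more bytes are expected and 200/201 on the final chunk:

    file_size = 20 * 1024 * 1024   # hypothetical upload size
    chunk_size = 8 * 1024 * 1024   # mirrors CHUNK_SIZE above
    for start in range(0, file_size, chunk_size):
        end = min(start + chunk_size, file_size) - 1
        print(f"Content-Range: bytes {start}-{end}/{file_size}")
    # Content-Range: bytes 0-8388607/20971520
    # Content-Range: bytes 8388608-16777215/20971520
    # Content-Range: bytes 16777216-20971519/20971520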
331
+ async def _aupload_file_with_progress(
332
+ signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
333
+ ) -> None:
334
+ """Upload a file asynchronously using aiohttp with signed URL initiation."""
335
+ _setup_upload_progress(file_path, file_size, progress_bar)
336
+ _log_upload_debug(signed_url)
337
+
338
+ try:
339
+ retry_config = aiohttp.ClientTimeout(
340
+ total=max(600.0, file_size / (512 * 1024)), connect=30, sock_read=30
341
+ )
342
+ connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
343
+
344
+ async with aiohttp.ClientSession(
345
+ connector=connector, timeout=retry_config
346
+ ) as session:
347
+ session_uri = await _initiate_resumable_session(session, signed_url)
348
+
349
+ async with aiofiles.open(file_path, "rb") as file_obj:
350
+ bytes_uploaded = 0
351
+
352
+ while bytes_uploaded < file_size:
353
+ remaining = file_size - bytes_uploaded
354
+ current_chunk_size = min(CHUNK_SIZE, remaining)
355
+ chunk_data = await file_obj.read(current_chunk_size)
356
+
357
+ if not chunk_data:
358
+ break
359
+
360
+ uploaded_bytes = await _upload_chunk_with_retry(
361
+ session,
362
+ session_uri,
363
+ chunk_data,
364
+ bytes_uploaded,
365
+ file_size,
366
+ progress_bar,
367
+ )
368
+ bytes_uploaded += uploaded_bytes
369
+
370
+ if bytes_uploaded >= file_size:
371
+ break
372
+
373
+ logger.debug("Upload completed successfully")
374
+
375
+ except Exception as e:
376
+ logger.error(f"Async resumable upload error: {type(e).__name__}: {e}")
377
+ raise
378
+
379
+
380
+ def _upload_file_with_progress(
381
+ signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
382
+ ) -> None:
383
+ """Upload a file synchronously using google.resumable_media with signed URL initiation."""
384
+ _setup_upload_progress(file_path, file_size, progress_bar)
385
+ _log_upload_debug(signed_url)
386
+
387
+ try:
388
+ session = requests_lib.Session()
389
+ retry_strategy = Retry(
390
+ total=MAX_RETRIES,
391
+ backoff_factor=2,
392
+ status_forcelist=[429, 500, 502, 503, 504],
393
+ allowed_methods=["POST", "PUT", "PATCH"],
394
+ )
395
+ adapter = HTTPAdapter(max_retries=retry_strategy)
396
+ session.mount("http://", adapter)
397
+ session.mount("https://", adapter)
398
+
399
+ logger.debug("Initiating resumable upload session")
400
+ initiate_response = session.post(
401
+ signed_url, headers=INITIATE_HEADERS, timeout=30
402
+ )
403
+
404
+ if initiate_response.status_code not in {200, 201}:
405
+ logger.error(
406
+ f"Failed to initiate resumable session: {initiate_response.status_code}"
407
+ )
408
+ logger.error(f"Response: {initiate_response.text}")
409
+ initiate_response.raise_for_status()
410
+
411
+ session_uri = _validate_session_uri(initiate_response.headers.get("location"))
412
+
413
+ with open(file_path, "rb") as file_obj:
414
+ upload = resumable_requests.ResumableUpload(
415
+ upload_url=signed_url, chunk_size=CHUNK_SIZE
416
+ )
417
+
418
+ upload._resumable_url = session_uri
419
+ upload._stream = file_obj
420
+ upload._total_bytes = file_size
421
+
422
+ wrapped_file = ProgressWrapper(file_obj, progress_bar)
423
+ upload._stream = wrapped_file
424
+
425
+ while not upload.finished:
426
+ try:
427
+ upload.transmit_next_chunk(session)
428
+ except Exception as e:
429
+ logger.error(f"Chunk upload failed: {e}")
430
+ raise
431
+
432
+ logger.debug("Upload completed successfully using resumable_media library")
433
+
434
+ except Exception as e:
435
+ logger.error(f"Sync resumable upload error: {type(e).__name__}: {e}")
436
+ raise
437
+
438
+
439
+ class RestClientError(Exception):
440
+ """Base exception for REST client errors."""
441
+
442
+
443
+ class DataStorageError(RestClientError):
444
+ """Base exception for data storage operations."""
445
+
446
+
447
+ class DataStorageCreationError(DataStorageError):
448
+ """Raised when there's an error creating a data storage entry."""
449
+
450
+
451
+ class DataStorageRetrievalError(DataStorageError):
452
+ """Raised when there's an error retrieving a data storage entry."""
453
+
454
+
455
+ class ProgressWrapper:
456
+ """Common progress wrapper for file uploads."""
457
+
458
+ def __init__(self, file_obj, progress_bar):
459
+ self.file_obj = file_obj
460
+ self.progress_bar = progress_bar
461
+ self.bytes_read = 0
462
+
463
+ def read(self, size=-1):
464
+ data = self.file_obj.read(size)
465
+ if data:
466
+ self.bytes_read += len(data)
467
+ current_pos = self.file_obj.tell()
468
+ if current_pos > self.progress_bar.n:
469
+ self.progress_bar.update(current_pos - self.progress_bar.n)
470
+ return data
471
+
472
+ def seek(self, offset, whence=0):
473
+ return self.file_obj.seek(offset, whence)
474
+
475
+ def tell(self):
476
+ return self.file_obj.tell()
477
+
478
+
479
+ class DataStorageMethods:
480
+ """Data storage methods for RestClient.
481
+
482
+ This class contains methods for interacting with the data storage API endpoints.
483
+ """
484
+
485
+ # annotated as `NoReturn` so mypy knows this helper always raises
486
+ def _handle_http_errors(self, e: HTTPStatusError, operation: str) -> NoReturn:
487
+ """Handle common HTTP errors for data storage operations."""
488
+ if e.response.status_code == codes.FORBIDDEN:
489
+ raise DataStorageError(
490
+ f"Error {operation} data storage entry, not authorized"
491
+ ) from e
492
+ if e.response.status_code == codes.UNPROCESSABLE_ENTITY:
493
+ raise DataStorageError(f"Invalid request payload: {e.response.text}") from e
494
+ raise DataStorageError(
495
+ f"Error {operation} data storage entry: {e.response.status_code} - {e.response.text}"
496
+ ) from e
497
+
498
+ def _validate_file_path(self, file_path: str | Path) -> Path:
499
+ """Validate file path exists and return Path object."""
500
+ file_path = Path(file_path)
501
+ if not file_path.exists():
502
+ raise DataStorageError(f"File or directory not found: {file_path}")
503
+ return file_path
504
+
505
+ def _build_zip_path(self, name: str, path: str | None) -> str:
506
+ """Build GCS path for zip file."""
507
+ zip_filename = name if name.endswith(".zip") else f"{name}.zip"
508
+ if path:
509
+ return f"{path.rstrip('/')}/{zip_filename}"
510
+ return zip_filename
511
+
512
+ # TODO: methods in here need to be moved to fh tools
513
+ # =====================================
514
+ def _is_zip_file(self, file_path: Path) -> bool:
515
+ """Check if a file is a zip file by checking its magic bytes."""
516
+ try:
517
+ with open(file_path, "rb") as f:
518
+ magic = f.read(2)
519
+ return magic == b"PK"
520
+ except Exception:
521
+ return False
522
+
523
+ def _extract_zip_file(self, zip_path: Path, extract_to: Path) -> Path:
524
+ """Extract a zip file and return the path to the extracted content.
525
+
526
+ Args:
527
+ zip_path: Path to the zip file
528
+ extract_to: Directory to extract to
529
+
530
+ Returns:
531
+ Path to the extracted content (directory or single file)
532
+ """
533
+ extract_dir = extract_to / "extracted"
534
+ extract_dir.mkdir(exist_ok=True)
535
+
536
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
537
+ zip_ref.extractall(extract_dir)
538
+ extracted_items = list(extract_dir.iterdir())
539
+
540
+ if len(extracted_items) == 1:
541
+ return extracted_items[0]
542
+ return extract_dir
543
+
544
+ async def _adownload_from_gcs(
545
+ self, signed_url: str, file_name: str | None = None
546
+ ) -> Path:
547
+ """Download file from GCS using signed URL and handle unzipping if needed.
548
+
549
+ Args:
550
+ signed_url: The signed URL to download from
551
+ file_name: The name of the file to download
552
+
553
+ Returns:
554
+ Path to the downloaded file (or unzipped directory if it was a zip)
555
+ """
556
+ file_name = file_name or "downloaded_file"
557
+
558
+ try:
559
+ with tempfile.TemporaryDirectory() as temp_dir_str:
560
+ temp_dir = Path(temp_dir_str)
561
+ temp_file = temp_dir / file_name
562
+
563
+ async with self.async_client.stream("GET", signed_url) as response:
564
+ response.raise_for_status()
565
+
566
+ content_disposition = response.headers.get(
567
+ "content-disposition", ""
568
+ )
569
+ filename = file_name
570
+ if "filename=" in content_disposition:
571
+ filename = content_disposition.split("filename=")[-1].strip('"')
572
+
573
+ if filename != file_name:
574
+ temp_file = temp_dir / filename
575
+
576
+ async with aiofiles.open(temp_file, "wb") as f:
577
+ async for chunk in response.aiter_bytes(chunk_size=8192):
578
+ await f.write(chunk)
579
+
580
+ logger.debug(
581
+ f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
582
+ )
583
+
584
+ if self._is_zip_file(temp_file):
585
+ logger.debug(f"File {temp_file} is a zip file, extracting...")
586
+ extracted_path = self._extract_zip_file(temp_file, temp_dir)
587
+
588
+ final_temp_dir = Path(tempfile.mkdtemp())
589
+ final_path = final_temp_dir / extracted_path.name
590
+
591
+ if extracted_path.is_dir():
592
+ shutil.copytree(extracted_path, final_path)
593
+ else:
594
+ shutil.copy2(extracted_path, final_path)
595
+
596
+ return final_path
597
+ final_temp_dir = Path(tempfile.mkdtemp())
598
+ final_file = final_temp_dir / temp_file.name
599
+ shutil.copy2(temp_file, final_file)
600
+ return final_file
601
+
602
+ except Exception as e:
603
+ raise DataStorageError(f"Failed to download from GCS: {e}") from e
604
+
605
+ def _download_from_gcs(self, signed_url: str, file_name: str | None = None) -> Path:
606
+ """Download file from GCS using signed URL and handle unzipping if needed (sync version).
607
+
608
+ Args:
609
+ signed_url: The signed URL to download from
610
+ file_name: The name of the file to download
611
+ Returns:
612
+ Path to the downloaded file (or unzipped directory if it was a zip)
613
+ """
614
+ file_name = file_name or "downloaded_file"
615
+
616
+ try:
617
+ with tempfile.TemporaryDirectory() as temp_dir_str:
618
+ temp_dir = Path(temp_dir_str)
619
+ temp_file = temp_dir / file_name
620
+
621
+ with requests_lib.get(signed_url, stream=True, timeout=30) as response:
622
+ response.raise_for_status()
623
+
624
+ content_disposition = response.headers.get(
625
+ "content-disposition", ""
626
+ )
627
+ filename = file_name
628
+ if "filename=" in content_disposition:
629
+ filename = content_disposition.split("filename=")[-1].strip('"')
630
+
631
+ if filename != file_name:
632
+ temp_file = temp_dir / filename
633
+
634
+ with open(temp_file, "wb") as f:
635
+ for chunk in response.iter_content(chunk_size=8192):
636
+ f.write(chunk)
637
+
638
+ logger.debug(
639
+ f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
640
+ )
641
+
642
+ if self._is_zip_file(temp_file):
643
+ logger.debug(f"File {temp_file} is a zip file, extracting...")
644
+ extracted_path = self._extract_zip_file(temp_file, temp_dir)
645
+
646
+ final_temp_dir = Path(tempfile.mkdtemp())
647
+ final_path = final_temp_dir / extracted_path.name
648
+
649
+ if extracted_path.is_dir():
650
+ shutil.copytree(extracted_path, final_path)
651
+ else:
652
+ shutil.copy2(extracted_path, final_path)
653
+
654
+ return final_path
655
+ final_temp_dir = Path(tempfile.mkdtemp())
656
+ final_file = final_temp_dir / temp_file.name
657
+ shutil.copy2(temp_file, final_file)
658
+ return final_file
659
+
660
+ except Exception as e:
661
+ raise DataStorageError(f"Failed to download from GCS: {e}") from e
662
+
663
+ def _prepare_single_file_upload(
664
+ self, name: str, file_path: Path, description: str | None, path: str | None
665
+ ) -> tuple[int, DataStorageRequestPayload | None]:
666
+ """Prepare single file for upload, return file size and payload if text content."""
667
+ file_size = file_path.stat().st_size
668
+ logger.debug(
669
+ f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
670
+ )
671
+
672
+ if _should_send_as_text_content(file_path, file_size):
673
+ logger.debug(
674
+ f"Small text file ({file_size:,} bytes) - sending as text content"
675
+ )
676
+ text_content = _extract_text_from_file(file_path)
677
+ if text_content is not None:
678
+ return file_size, DataStorageRequestPayload(
679
+ name=name,
680
+ description=description,
681
+ content=text_content,
682
+ path=path,
683
+ is_collection=False,
684
+ )
685
+ logger.warning(
686
+ "Could not extract text content, falling back to file upload"
687
+ )
688
+
689
+ return file_size, None
690
+
691
+ def _create_data_storage_entry(
692
+ self, payload: DataStorageRequestPayload
693
+ ) -> DataStorageResponse:
694
+ """Create data storage entry via API (sync version)."""
695
+ response = self.client.post(
696
+ "/v0.1/data-storage/data-entries",
697
+ json=payload.model_dump(mode="json", exclude_none=True),
698
+ )
699
+ response.raise_for_status()
700
+ return DataStorageResponse.model_validate(response.json())
701
+
702
+ async def _acreate_data_storage_entry(
703
+ self, payload: DataStorageRequestPayload
704
+ ) -> DataStorageResponse:
705
+ """Create data storage entry via API (async version)."""
706
+ response = await self.async_client.post(
707
+ "/v0.1/data-storage/data-entries",
708
+ json=payload.model_dump(mode="json", exclude_none=True),
709
+ )
710
+ response.raise_for_status()
711
+ return DataStorageResponse.model_validate(response.json())
712
+
713
+ def _generate_folder_description_from_files(
714
+ self, dir_path: Path, manifest: DirectoryManifest
715
+ ) -> str:
716
+ """Generate folder description by concatenating descriptions of top-level files."""
717
+ descriptions = []
718
+
719
+ # Get top-level files only (not recursive)
720
+ for item in dir_path.iterdir():
721
+ if item.is_file():
722
+ # Try to get description from manifest first
723
+ file_desc = manifest.get_entry_description(item.name)
724
+
725
+ if file_desc:
726
+ descriptions.append(f"{item.name}: {file_desc}")
727
+ else:
728
+ descriptions.append(item.name)
729
+
730
+ if descriptions:
731
+ return f"Directory containing: {', '.join(descriptions)}"
732
+ return f"Directory: {dir_path.name}"
733
+
734
+ def _load_manifest(
735
+ self, dir_path: Path, manifest_filename: str | None
736
+ ) -> DirectoryManifest:
737
+ """Load and parse a manifest file (JSON or YAML) into a structured model."""
738
+ if not manifest_filename:
739
+ return DirectoryManifest()
740
+
741
+ manifest_path = dir_path / manifest_filename
742
+ if not manifest_path.exists():
743
+ logger.error(f"Manifest file not found at {manifest_path}")
744
+ raise DataStorageCreationError(
745
+ f"Manifest file {manifest_filename} not found in directory {dir_path}. Ensure the manifest exists and is correctly named, or do not pass it as an argument."
746
+ )
747
+
748
+ try:
749
+ with open(manifest_path, encoding="utf-8") as f:
750
+ data = {}
751
+ if manifest_filename.lower().endswith(".json"):
752
+ data = json.load(f)
753
+ elif manifest_filename.lower().endswith((".yaml", ".yml")):
754
+ if yaml is None:
755
+ raise ImportError(
756
+ "pyyaml is required to parse .yaml manifest files. "
757
+ "Please install it with `pip install pyyaml`."
758
+ )
759
+ data = yaml.safe_load(f)
760
+ else:
761
+ logger.warning(
762
+ f"Unsupported manifest file extension: {manifest_filename}"
763
+ )
764
+ return DirectoryManifest()
765
+
766
+ return DirectoryManifest.from_dict(data or {})
767
+
768
+ except Exception as e:
769
+ logger.warning(f"Failed to load manifest {manifest_filename}: {e}")
770
+
771
+ return DirectoryManifest()
772
+
773
+ def _upload_data_directory(
774
+ self,
775
+ name: str,
776
+ dir_path: Path,
777
+ description: str | None,
778
+ path: str | None = None,
779
+ ignore_patterns: list[str] | None = None,
780
+ ignore_filename: str = ".gitignore",
781
+ project_id: UUID | None = None,
782
+ ) -> DataStorageResponse:
783
+ """Upload a directory as a single zip file collection.
784
+
785
+ Args:
786
+ name: Name for the directory collection
787
+ dir_path: Path to directory to zip and upload
788
+ description: Description for the collection
789
+ path: Optional GCS path for the zip file
790
+ ignore_patterns: List of patterns to ignore when zipping
791
+ ignore_filename: Name of ignore file to read from directory
792
+ project_id: ID of the project this data storage entry belongs to
793
+
794
+ Returns:
795
+ DataStorageResponse for the uploaded zip file
796
+ """
797
+ logger.debug(f"Uploading directory as zip: {dir_path}")
798
+
799
+ with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
800
+ temp_zip_path = Path(temp_file.name)
801
+
802
+ zip_size = _create_directory_zip(
803
+ dir_path, temp_zip_path, ignore_patterns, ignore_filename
804
+ )
805
+
806
+ zip_gcs_path = self._build_zip_path(name, path)
807
+ payload = DataStorageRequestPayload(
808
+ name=name,
809
+ description=description,
810
+ path=zip_gcs_path,
811
+ is_collection=True,
812
+ project_id=project_id,
813
+ )
814
+
815
+ logger.debug(
816
+ f"Creating data storage entry for zip: {payload.model_dump(exclude_none=True)}"
817
+ )
818
+ data_storage_response = self._create_data_storage_entry(payload)
819
+
820
+ for storage_location in data_storage_response.storage_locations:
821
+ if not storage_location.storage_config.signed_url:
822
+ raise DataStorageCreationError(
823
+ "No signed URL returned for zip upload"
824
+ )
825
+
826
+ with tqdm(
827
+ total=zip_size,
828
+ unit="B",
829
+ unit_scale=True,
830
+ unit_divisor=1024,
831
+ desc=f"Uploading {dir_path.name} (zipped)",
832
+ miniters=1,
833
+ mininterval=0.1,
834
+ ) as pbar:
835
+ _upload_file_with_progress(
836
+ storage_location.storage_config.signed_url,
837
+ temp_zip_path,
838
+ pbar,
839
+ zip_size,
840
+ )
841
+
842
+ status_response = self.client.patch(
843
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
844
+ json={"status": "active"},
845
+ )
846
+ status_response.raise_for_status()
847
+
848
+ logger.debug(
849
+ f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
850
+ )
851
+ return DataStorageResponse.model_validate(status_response.json())
852
+
853
+ async def _aupload_data_directory(
854
+ self,
855
+ name: str,
856
+ dir_path: Path,
857
+ description: str | None,
858
+ path: str | None = None,
859
+ ignore_patterns: list[str] | None = None,
860
+ ignore_filename: str = ".gitignore",
861
+ project_id: UUID | None = None,
862
+ ) -> DataStorageResponse:
863
+ """Asynchronously upload a directory as a single zip file.
864
+
865
+ Args:
866
+ name: Name for the directory collection
867
+ dir_path: Path to directory to zip and upload
868
+ description: Description for the collection
869
+ path: Optional GCS path for the zip file
870
+ ignore_patterns: List of patterns to ignore when zipping
871
+ ignore_filename: Name of ignore file to read from directory
872
+ project_id: ID of the project this data storage entry belongs to
873
+
874
+ Returns:
875
+ DataStorageResponse for the uploaded zip file
876
+ """
877
+ logger.debug(f"Async uploading directory as zip: {dir_path}")
878
+
879
+ with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
880
+ temp_zip_path = Path(temp_file.name)
881
+
882
+ zip_size = _create_directory_zip(
883
+ dir_path, temp_zip_path, ignore_patterns, ignore_filename
884
+ )
885
+
886
+ zip_gcs_path = self._build_zip_path(name, path)
887
+ payload = DataStorageRequestPayload(
888
+ name=name,
889
+ description=description,
890
+ path=zip_gcs_path,
891
+ is_collection=True,
892
+ project_id=project_id,
893
+ )
894
+
895
+ data_storage_response = await self._acreate_data_storage_entry(payload)
896
+
897
+ for storage_location in data_storage_response.storage_locations:
898
+ if not storage_location.storage_config.signed_url:
899
+ raise DataStorageCreationError(
900
+ "No signed URL returned for zip upload"
901
+ )
902
+
903
+ with tqdm(
904
+ total=zip_size,
905
+ unit="B",
906
+ unit_scale=True,
907
+ unit_divisor=1024,
908
+ desc=f"Uploading {dir_path.name} (zipped)",
909
+ miniters=1,
910
+ mininterval=0.1,
911
+ ) as pbar:
912
+ await _aupload_file_with_progress(
913
+ storage_location.storage_config.signed_url,
914
+ temp_zip_path,
915
+ pbar,
916
+ zip_size,
917
+ )
918
+
919
+ status_response = await self.async_client.patch(
920
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
921
+ json={"status": "active"},
922
+ )
923
+ status_response.raise_for_status()
924
+
925
+ logger.debug(
926
+ f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
927
+ )
928
+ return DataStorageResponse.model_validate(status_response.json())
929
+
930
+ def _upload_data_single_file(
931
+ self,
932
+ name: str,
933
+ file_path: Path,
934
+ description: str | None,
935
+ path: str | None = None,
936
+ project_id: UUID | None = None,
937
+ ) -> DataStorageResponse:
938
+ """Upload a single file."""
939
+ file_size = file_path.stat().st_size
940
+ logger.debug(
941
+ f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
942
+ )
943
+
944
+ if _should_send_as_text_content(file_path, file_size):
945
+ logger.debug(
946
+ f"Small text file ({file_size:,} bytes) - sending as text content"
947
+ )
948
+
949
+ text_content = _extract_text_from_file(file_path)
950
+ if text_content is not None:
951
+ payload = DataStorageRequestPayload(
952
+ name=name,
953
+ description=description,
954
+ content=text_content,
955
+ path=path,
956
+ is_collection=False,
957
+ project_id=project_id,
958
+ )
959
+
960
+ logger.debug("Sending file as text content")
961
+ return self._create_data_storage_entry(payload)
962
+ logger.warning(
963
+ "Could not extract text content, falling back to file upload"
964
+ )
965
+
966
+ logger.debug(
967
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
968
+ )
969
+ payload = DataStorageRequestPayload(
970
+ name=name,
971
+ description=description,
972
+ path=path,
973
+ is_collection=False,
974
+ project_id=project_id,
975
+ )
976
+
977
+ logger.debug(
978
+ f"Requesting signed URL with payload: {payload.model_dump(exclude_none=True)}"
979
+ )
980
+
981
+ data_storage_response = self._create_data_storage_entry(payload)
982
+
983
+ for storage_location in data_storage_response.storage_locations:
984
+ if not storage_location.storage_config.signed_url:
985
+ raise DataStorageCreationError("No signed URL returned from server")
986
+
987
+ with tqdm(
988
+ total=file_size,
989
+ unit="B",
990
+ unit_scale=True,
991
+ unit_divisor=1024,
992
+ desc=f"Uploading {file_path.name}",
993
+ miniters=1,
994
+ mininterval=0.1,
995
+ ) as pbar:
996
+ try:
997
+ _upload_file_with_progress(
998
+ storage_location.storage_config.signed_url,
999
+ file_path,
1000
+ pbar,
1001
+ file_size,
1002
+ )
1003
+ logger.debug("File upload to signed URL completed successfully")
1004
+ except Exception as e:
1005
+ logger.error(f"Failed to upload file to signed URL: {e}")
1006
+ raise
1007
+
1008
+ logger.debug("Updating data storage status to active")
1009
+ status_response = self.client.patch(
1010
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1011
+ json={"status": "active"},
1012
+ )
1013
+ status_response.raise_for_status()
1014
+ logger.debug("Data storage status updated successfully")
1015
+
1016
+ return DataStorageResponse.model_validate(status_response.json())
1017
+
1018
+ async def _aupload_data_single_file(
1019
+ self,
1020
+ name: str,
1021
+ file_path: Path,
1022
+ description: str | None,
1023
+ path: str | None = None,
1024
+ dataset_id: UUID | None = None,
1025
+ project_id: UUID | None = None,
1026
+ ) -> DataStorageResponse:
1027
+ """Asynchronously upload a single file."""
1028
+ file_size, text_payload = self._prepare_single_file_upload(
1029
+ name, file_path, description, path
1030
+ )
1031
+
1032
+ if text_payload:
1033
+ logger.debug("Sending file as text content")
1034
+ text_payload.dataset_id = dataset_id
1035
+ return await self._acreate_data_storage_entry(text_payload)
1036
+
1037
+ logger.debug(
1038
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
1039
+ )
1040
+ payload = DataStorageRequestPayload(
1041
+ name=name,
1042
+ description=description,
1043
+ path=path,
1044
+ is_collection=False,
1045
+ dataset_id=dataset_id,
1046
+ project_id=project_id,
1047
+ )
1048
+
1049
+ data_storage_response = await self._acreate_data_storage_entry(payload)
1050
+
1051
+ for location in data_storage_response.storage_locations:
1052
+ if not location.storage_config.signed_url:
1053
+ raise DataStorageCreationError(
1054
+ f"No signed URL returned from server for location: {location.id}"
1055
+ )
1056
+
1057
+ with tqdm(
1058
+ total=file_size,
1059
+ unit="B",
1060
+ unit_scale=True,
1061
+ unit_divisor=1024,
1062
+ desc=f"Uploading {file_path.name}",
1063
+ miniters=1,
1064
+ mininterval=0.1,
1065
+ leave=False,
1066
+ ) as pbar:
1067
+ await _aupload_file_with_progress(
1068
+ location.storage_config.signed_url, file_path, pbar, file_size
1069
+ )
1070
+
1071
+ status_response = await self.async_client.patch(
1072
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1073
+ json={"status": "active"},
1074
+ )
1075
+ status_response.raise_for_status()
1076
+
1077
+ return DataStorageResponse.model_validate(status_response.json())
1078
+
1079
+ def _upload_data_single_file_with_parent(
1080
+ self,
1081
+ name: str,
1082
+ file_path: Path,
1083
+ description: str | None,
1084
+ path: str | None,
1085
+ parent_id: UUID | None,
1086
+ dataset_id: UUID | None = None,
1087
+ project_id: UUID | None = None,
1088
+ ) -> DataStorageResponse:
1089
+ """Upload a single file with a parent ID (sync version)."""
1090
+ file_size, text_payload = self._prepare_single_file_upload(
1091
+ name, file_path, description, path
1092
+ )
1093
+
1094
+ if text_payload:
1095
+ logger.debug("Sending file as text content with parent_id")
1096
+ text_payload.parent_id = parent_id
1097
+ text_payload.dataset_id = dataset_id
1098
+ text_payload.project_id = project_id
1099
+ return self._create_data_storage_entry(text_payload)
1100
+
1101
+ logger.debug(
1102
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
1103
+ )
1104
+ payload = DataStorageRequestPayload(
1105
+ name=name,
1106
+ description=description,
1107
+ path=path,
1108
+ is_collection=False,
1109
+ parent_id=parent_id,
1110
+ dataset_id=dataset_id,
1111
+ project_id=project_id,
1112
+ )
1113
+ data_storage_response = self._create_data_storage_entry(payload)
1114
+
1115
+ for location in data_storage_response.storage_locations:
1116
+ if not location.storage_config.signed_url:
1117
+ raise DataStorageCreationError("No signed URL returned from server")
1118
+
1119
+ with tqdm(
1120
+ total=file_size,
1121
+ unit="B",
1122
+ unit_scale=True,
1123
+ unit_divisor=1024,
1124
+ desc=f"Uploading {file_path.name}",
1125
+ miniters=1,
1126
+ mininterval=0.1,
1127
+ leave=False,
1128
+ ) as pbar:
1129
+ _upload_file_with_progress(
1130
+ location.storage_config.signed_url, file_path, pbar, file_size
1131
+ )
1132
+
1133
+ status_response = self.client.patch(
1134
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1135
+ json={"status": "active"},
1136
+ )
1137
+ status_response.raise_for_status()
1138
+
1139
+ return DataStorageResponse.model_validate(status_response.json())
1140
+
1141
+ def _process_file_item(
1142
+ self,
1143
+ item: Path,
1144
+ dir_manifest: DirectoryManifest,
1145
+ current_parent_id: UUID,
1146
+ dataset_id: UUID | None = None,
1147
+ project_id: UUID | None = None,
1148
+ ) -> DataStorageResponse | None:
1149
+ """Process a single file item for upload."""
1150
+ try:
1151
+ manifest_desc = dir_manifest.get_entry_description(item.name)
1152
+ file_description = manifest_desc or f"File: {item.name}"
1153
+
1154
+ logger.debug(
1155
+ f"Processing file {item.name} with description: '{file_description}'"
1156
+ )
1157
+
1158
+ return self._upload_data_single_file_with_parent(
1159
+ name=item.name,
1160
+ file_path=item,
1161
+ description=file_description,
1162
+ path=None,
1163
+ parent_id=current_parent_id,
1164
+ dataset_id=dataset_id,
1165
+ project_id=project_id,
1166
+ )
1167
+ except Exception as e:
1168
+ logger.error(f"Failed to upload file {item}: {e}")
1169
+ return None
1170
+
1171
+ def _upload_directory_hierarchically(
1172
+ self,
1173
+ name: str,
1174
+ dir_path: Path,
1175
+ description: str | None = None,
1176
+ manifest_filename: str | None = None,
1177
+ parent_id: UUID | None = None,
1178
+ ignore_patterns: list[str] | None = None,
1179
+ ignore_filename: str = ".gitignore",
1180
+ base_dir: Path | None = None,
1181
+ dir_manifest: DirectoryManifest | None = None,
1182
+ dataset_id: UUID | None = None,
1183
+ project_id: UUID | None = None,
1184
+ ) -> list[DataStorageResponse]:
1185
+ """Upload a directory with single dataset and individual file storage entries."""
1186
+ responses = []
1187
+ if parent_id is None:
1188
+ base_dir = dir_path
1189
+ all_ignore_patterns = _collect_ignore_patterns(
1190
+ base_dir, ignore_patterns, ignore_filename
1191
+ )
1192
+
1193
+ payload = DataStorageRequestPayload(
1194
+ name=name,
1195
+ description=description,
1196
+ parent_id=None,
1197
+ dataset_id=None,
1198
+ is_collection=False,
1199
+ project_id=project_id,
1200
+ )
1201
+
1202
+ dir_response = self._create_data_storage_entry(payload)
1203
+ responses.append(dir_response)
1204
+ current_parent_id = dir_response.data_storage.id
1205
+ current_dataset_id = dir_response.data_storage.dataset_id
1206
+
1207
+ dir_manifest = self._load_directory_manifest(
1208
+ manifest_filename, parent_id, dir_path
1209
+ )
1210
+ else:
1211
+ all_ignore_patterns = ignore_patterns or []
1212
+ current_parent_id = parent_id
1213
+ current_dataset_id = dataset_id
1214
+
1215
+ for item in dir_path.iterdir():
1216
+ if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
1217
+ continue
1218
+
1219
+ if item.is_dir():
1220
+ subdir_manifest = DirectoryManifest()
1221
+ if dir_manifest:
1222
+ entry = dir_manifest.entries.get(item.name)
1223
+ if isinstance(entry, DirectoryManifest):
1224
+ subdir_manifest = entry
1225
+ elif isinstance(entry, ManifestEntry):
1226
+ # Convert single entry to manifest
1227
+ subdir_manifest = DirectoryManifest(entries={item.name: entry})
1228
+
1229
+ subdir_description = subdir_manifest.get_entry_description(item.name)
1230
+ if not subdir_description:
1231
+ subdir_description = self._generate_folder_description_from_files(
1232
+ item, subdir_manifest
1233
+ )
1234
+
1235
+ subdir_payload = DataStorageRequestPayload(
1236
+ name=item.name,
1237
+ description=subdir_description,
1238
+ parent_id=current_parent_id,
1239
+ dataset_id=current_dataset_id,
1240
+ is_collection=False,
1241
+ project_id=project_id,
1242
+ )
1243
+ subdir_response = self._create_data_storage_entry(subdir_payload)
1244
+ responses.append(subdir_response)
1245
+
1246
+ subdir_responses = self._upload_directory_hierarchically(
1247
+ name=item.name,
1248
+ dir_path=item,
1249
+ description=None,
1250
+ manifest_filename=None,
1251
+ parent_id=subdir_response.data_storage.id,
1252
+ ignore_patterns=all_ignore_patterns,
1253
+ ignore_filename=ignore_filename,
1254
+ base_dir=base_dir,
1255
+ dir_manifest=subdir_manifest,
1256
+ dataset_id=current_dataset_id,
1257
+ project_id=project_id,
1258
+ )
1259
+ responses.extend(subdir_responses)
1260
+ elif item.is_file():
1261
+ file_response = self._process_file_item(
1262
+ item,
1263
+ dir_manifest or DirectoryManifest(),
1264
+ current_parent_id,
1265
+ current_dataset_id,
1266
+ )
1267
+ if file_response:
1268
+ responses.append(file_response)
1269
+
1270
+ return responses
1271
+
1272
+ def _load_directory_manifest(
1273
+ self,
1274
+ manifest_filename: str | None,
1275
+ parent_id: UUID | None,
1276
+ dir_path: Path,
1277
+ ) -> DirectoryManifest:
1278
+ """Load directory manifest if available."""
1279
+ if manifest_filename and not parent_id:
1280
+ manifest_data = self._load_manifest(Path.cwd(), manifest_filename)
1281
+ dir_name = dir_path.name
1282
+ logger.debug(
1283
+ f"Loaded manifest entries: {list(manifest_data.entries.keys())}"
1284
+ )
1285
+ logger.debug(
1286
+ f"Looking for manifest entry with directory name: '{dir_name}'"
1287
+ )
1288
+
1289
+ entry = manifest_data.entries.get(dir_name)
1290
+ if isinstance(entry, DirectoryManifest):
1291
+ return entry
1292
+ if isinstance(entry, ManifestEntry):
1293
+ return DirectoryManifest(entries={dir_name: entry})
1294
+ logger.debug(
1295
+ f"No manifest entry found for '{dir_name}', available keys: {list(manifest_data.entries.keys())}"
1296
+ )
1297
+ return DirectoryManifest()
1298
+ return DirectoryManifest()
1299
+
1300
+ async def _aupload_data_single_file_with_parent(
1301
+ self,
1302
+ name: str,
1303
+ file_path: Path,
1304
+ description: str | None,
1305
+ path: str | None,
1306
+ parent_id: UUID | None,
1307
+ dataset_id: UUID | None = None,
1308
+ project_id: UUID | None = None,
1309
+ ) -> DataStorageResponse:
1310
+ """Asynchronously upload a single file with a parent ID."""
1311
+ file_size, text_payload = self._prepare_single_file_upload(
1312
+ name, file_path, description, path
1313
+ )
1314
+
1315
+ if text_payload:
1316
+ logger.debug("Sending file as text content with parent_id")
1317
+ text_payload.parent_id = parent_id
1318
+ text_payload.dataset_id = dataset_id
1319
+ text_payload.project_id = project_id
1320
+ return await self._acreate_data_storage_entry(text_payload)
1321
+
1322
+ logger.debug(
1323
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
1324
+ )
1325
+ payload = DataStorageRequestPayload(
1326
+ name=name,
1327
+ description=description,
1328
+ path=path,
1329
+ is_collection=False,
1330
+ parent_id=parent_id,
1331
+ dataset_id=dataset_id,
1332
+ project_id=project_id,
1333
+ )
1334
+ data_storage_response = await self._acreate_data_storage_entry(payload)
1335
+
1336
+ storage_location = data_storage_response.storage_locations[0]
1337
+
1338
+ if not storage_location.storage_config.signed_url:
1339
+ raise DataStorageCreationError("No signed URL returned from server")
1340
+
1341
+ with tqdm(
1342
+ total=file_size,
1343
+ unit="B",
1344
+ unit_scale=True,
1345
+ unit_divisor=1024,
1346
+ desc=f"Uploading {file_path.name}",
1347
+ miniters=1,
1348
+ mininterval=0.1,
1349
+ ) as pbar:
1350
+ await _aupload_file_with_progress(
1351
+ storage_location.storage_config.signed_url, file_path, pbar, file_size
1352
+ )
1353
+
1354
+ status_response = await self.async_client.patch(
1355
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1356
+ json={"status": "active"},
1357
+ )
1358
+ status_response.raise_for_status()
1359
+
1360
+ return DataStorageResponse.model_validate(status_response.json())
1361
+
1362
+ async def _aprocess_file_item(
1363
+ self,
1364
+ item: Path,
1365
+ dir_manifest: DirectoryManifest,
1366
+ current_parent_id: UUID,
1367
+ dataset_id: UUID | None = None,
1368
+ project_id: UUID | None = None,
1369
+ ) -> DataStorageResponse | None:
1370
+ """Asynchronously process a single file item for upload."""
1371
+ try:
1372
+ manifest_desc = dir_manifest.get_entry_description(item.name)
1373
+ file_description = manifest_desc or f"File: {item.name}"
1374
+
1375
+ logger.debug(
1376
+ f"Processing file {item.name} with description: '{file_description}'"
1377
+ )
1378
+
1379
+ return await self._aupload_data_single_file_with_parent(
1380
+ name=item.name,
1381
+ file_path=item,
1382
+ description=file_description,
1383
+ path=None,
1384
+ parent_id=current_parent_id,
1385
+ dataset_id=dataset_id,
1386
+ project_id=project_id,
1387
+ )
1388
+ except Exception as e:
1389
+ logger.error(f"Failed to upload file {item}: {e}")
1390
+ return None
1391
+
1392
+ async def _aupload_directory_hierarchically(
1393
+ self,
1394
+ name: str,
1395
+ dir_path: Path,
1396
+ description: str | None = None,
1397
+ manifest_filename: str | None = None,
1398
+ parent_id: UUID | None = None,
1399
+ ignore_patterns: list[str] | None = None,
1400
+ ignore_filename: str = ".gitignore",
1401
+ base_dir: Path | None = None,
1402
+ dir_manifest: DirectoryManifest | None = None,
1403
+ dataset_id: UUID | None = None,
1404
+ project_id: UUID | None = None,
1405
+ ) -> list[DataStorageResponse]:
1406
+ """Upload a directory with single dataset and individual file storage entries (async)."""
1407
+ responses = []
1408
+
1409
+ if parent_id is None:
1410
+ base_dir = dir_path
1411
+ all_ignore_patterns = _collect_ignore_patterns(
1412
+ base_dir, ignore_patterns, ignore_filename
1413
+ )
1414
+
1415
+ payload = DataStorageRequestPayload(
1416
+ name=name,
1417
+ description=description,
1418
+ parent_id=None,
1419
+ dataset_id=None,
1420
+ is_collection=False,
1421
+ project_id=project_id,
1422
+ )
1423
+
1424
+ dir_response = await self._acreate_data_storage_entry(payload)
1425
+ responses.append(dir_response)
1426
+ current_parent_id = dir_response.data_storage.id
1427
+ current_dataset_id = dir_response.data_storage.dataset_id
1428
+
1429
+ dir_manifest = self._load_directory_manifest(
1430
+ manifest_filename, parent_id, dir_path
1431
+ )
1432
+ else:
1433
+ all_ignore_patterns = ignore_patterns or []
1434
+ current_parent_id = parent_id
1435
+ current_dataset_id = dataset_id
1436
+
1437
+ for item in dir_path.iterdir():
1438
+ if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
1439
+ continue
1440
+
1441
+ if item.is_dir():
1442
+ subdir_manifest = DirectoryManifest()
1443
+ if dir_manifest:
1444
+ entry = dir_manifest.entries.get(item.name)
1445
+ if isinstance(entry, DirectoryManifest):
1446
+ subdir_manifest = entry
1447
+ elif isinstance(entry, ManifestEntry):
1448
+ subdir_manifest = DirectoryManifest(entries={item.name: entry})
1449
+
1450
+ subdir_description = subdir_manifest.get_entry_description(item.name)
1451
+ if not subdir_description:
1452
+ subdir_description = self._generate_folder_description_from_files(
1453
+ item, subdir_manifest
1454
+ )
1455
+
1456
+ subdir_payload = DataStorageRequestPayload(
1457
+ name=item.name,
1458
+ description=subdir_description,
1459
+ parent_id=current_parent_id,
1460
+ dataset_id=current_dataset_id,
1461
+ is_collection=False,
1462
+ project_id=project_id,
1463
+ )
1464
+ subdir_response = await self._acreate_data_storage_entry(subdir_payload)
1465
+ responses.append(subdir_response)
1466
+
1467
+ subdir_responses = await self._aupload_directory_hierarchically(
1468
+ name=item.name,
1469
+ dir_path=item,
1470
+ description=None,
1471
+ manifest_filename=None,
1472
+ parent_id=subdir_response.data_storage.id,
1473
+ ignore_patterns=all_ignore_patterns,
1474
+ ignore_filename=ignore_filename,
1475
+ base_dir=base_dir,
1476
+ dir_manifest=subdir_manifest,
1477
+ dataset_id=current_dataset_id,
1478
+ project_id=project_id,
1479
+ )
1480
+ responses.extend(subdir_responses)
1481
+ elif item.is_file():
1482
+ file_response = await self._aprocess_file_item(
1483
+ item,
1484
+ dir_manifest or DirectoryManifest(),
1485
+ current_parent_id,
1486
+ current_dataset_id,
1487
+ )
1488
+ if file_response:
1489
+ responses.append(file_response)
1490
+
1491
+ return responses
1492
+
1493
+ @property
1494
+ def client(self) -> Client:
1495
+ raise NotImplementedError("client property must be implemented by subclass")
1496
+
1497
+ @property
1498
+ def async_client(self) -> AsyncClient:
1499
+ raise NotImplementedError(
1500
+ "async_client property must be implemented by subclass"
1501
+ )
1502
+
1503
+ @retry(
1504
+ stop=stop_after_attempt(3),
1505
+ wait=wait_exponential(multiplier=1, max=10),
1506
+ retry=retry_if_connection_error,
1507
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1508
+ )
1509
+ def store_text_content(
1510
+ self,
1511
+ name: str,
1512
+ content: str,
1513
+ description: str | None = None,
1514
+ path: str | None = None,
1515
+ project_id: UUID | None = None,
1516
+ ) -> DataStorageResponse:
1517
+ """Store content as a string in the data storage system.
1518
+
1519
+ Args:
1520
+ name: Name of the data storage entry
1521
+ content: Content to store as a string
1522
+ description: Optional description of the data storage entry
1523
+ path: Optional path for the data storage entry
1524
+ project_id: ID of the project this data storage entry belongs to
1525
+
1526
+ Returns:
1527
+ DataStorageResponse containing the created data storage entry and storage locations
1528
+
1529
+ Raises:
1530
+ DataStorageCreationError: If there's an error creating the data storage entry
1531
+ """
1532
+ try:
1533
+ payload = DataStorageRequestPayload(
1534
+ name=name,
1535
+ content=content,
1536
+ description=description,
1537
+ path=path,
1538
+ project_id=project_id,
1539
+ )
1540
+ return self._create_data_storage_entry(payload)
1541
+ except HTTPStatusError as e:
1542
+ self._handle_http_errors(e, "creating")
1543
+ except Exception as e:
1544
+ raise DataStorageCreationError(
1545
+ f"An unexpected error occurred: {e!r}"
1546
+ ) from e
1547
+
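For reference, a hypothetical call to the method above, assuming `client` is the package's configured REST client (defined elsewhere in the package) that mixes in DataStorageMethods:

    response = client.store_text_content(
        name="notes.md",                          # hypothetical entry name
        content="# Experiment notes\n...",
        description="Scratch notes for run 42",
    )
    print(response.data_storage.id)               # id of the created entry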
1548
+ @retry(
1549
+ stop=stop_after_attempt(3),
1550
+ wait=wait_exponential(multiplier=1, max=10),
1551
+ retry=retry_if_connection_error,
1552
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1553
+ )
1554
+ async def astore_text_content(
1555
+ self,
1556
+ name: str,
1557
+ content: str,
1558
+ description: str | None = None,
1559
+ path: str | None = None,
1560
+ dataset_id: UUID | None = None,
1561
+ project_id: UUID | None = None,
1562
+ ) -> DataStorageResponse:
1563
+ """Asynchronously store content as a string in the data storage system.
1564
+
1565
+ Args:
1566
+ name: Name of the data storage entry
1567
+ content: Content to store as a string
1568
+ description: Optional description of the data storage entry
1569
+ path: Optional path for the data storage entry
1570
+ dataset_id: Optional dataset ID to add entry to, or None to create new dataset
1571
+ project_id: ID of the project this data storage entry belongs to
1572
+
1573
+ Returns:
1574
+ DataStorageResponse containing the created data storage entry and storage locations
1575
+
1576
+ Raises:
1577
+ DataStorageCreationError: If there's an error creating the data storage entry
1578
+ """
1579
+ try:
1580
+ payload = DataStorageRequestPayload(
1581
+ name=name,
1582
+ content=content,
1583
+ description=description,
1584
+ path=path,
1585
+ dataset_id=dataset_id,
1586
+ project_id=project_id,
1587
+ )
1588
+ return await self._acreate_data_storage_entry(payload)
1589
+ except HTTPStatusError as e:
1590
+ self._handle_http_errors(e, "creating")
1591
+ except Exception as e:
1592
+ raise DataStorageCreationError(
1593
+ f"An unexpected error occurred: {e!r}"
1594
+ ) from e
1595
+
1596
+ @retry(
1597
+ stop=stop_after_attempt(3),
1598
+ wait=wait_exponential(multiplier=1, max=10),
1599
+ retry=retry_if_connection_error,
1600
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1601
+ )
1602
+ async def astore_link(
1603
+ self,
1604
+ name: str,
1605
+ url: HttpUrl,
1606
+ description: str,
1607
+ instructions: str,
1608
+ api_key: str | None = None,
1609
+ metadata: dict[str, Any] | None = None,
1610
+ dataset_id: UUID | None = None,
1611
+ project_id: UUID | None = None,
1612
+ ) -> DataStorageResponse:
1613
+ """Asynchronously store a link/URL in the data storage system.
1614
+
1615
+ Args:
1616
+ name: Name of the link entry
1617
+ url: The URL/link to store
1618
+ description: Searchable details of the link
1619
+ instructions: Instructions for how to consume the link or API
1620
+ api_key: Any authentication key needed to access the API. If included, you should also
1621
+ describe how the key should be consumed in the instructions.
1622
+ metadata: Any additional metadata about the link
1623
+ dataset_id: Optional dataset ID to add entry to, or None to create new dataset
1624
+ project_id: ID of the project this data storage entry belongs to
1625
+
1626
+ Returns:
1627
+ DataStorageResponse containing the created link storage entry
1628
+
1629
+ Raises:
1630
+ DataStorageCreationError: If there's an error creating the link storage entry
1631
+ """
1632
+ try:
1633
+ link_metadata = metadata.copy() if metadata else {}
1634
+ link_metadata["instructions"] = instructions
1635
+ if api_key:
1636
+ link_metadata["api_key"] = api_key
1637
+
1638
+ existing_location = DataStorageLocationPayload(
1639
+ storage_type=DataStorageType.LINK,
1640
+ content_type=DataContentType.TEXT,
1641
+ location=str(url),
1642
+ metadata=link_metadata or None,
1643
+ )
1644
+
1645
+ payload = DataStorageRequestPayload(
1646
+ name=name,
1647
+ content=str(url),
1648
+ description=description,
1649
+ dataset_id=dataset_id,
1650
+ project_id=project_id,
1651
+ existing_location=existing_location,
1652
+ )
1653
+ return await self._acreate_data_storage_entry(payload)
1654
+ except HTTPStatusError as e:
1655
+ self._handle_http_errors(e, "creating")
1656
+ except Exception as e:
1657
+ raise DataStorageCreationError(
1658
+ f"An unexpected error occurred: {e!r}"
1659
+ ) from e
1660
+
1661
+ def store_link(
1662
+ self,
1663
+ name: str,
1664
+ url: HttpUrl,
1665
+ description: str,
1666
+ instructions: str,
1667
+ api_key: str | None = None,
1668
+ metadata: dict[str, Any] | None = None,
1669
+ dataset_id: UUID | None = None,
1670
+ project_id: UUID | None = None,
1671
+ ) -> DataStorageResponse:
1672
+ """Store a link/URL in the data storage system.
1673
+
1674
+ Args:
1675
+ name: Name of the link entry
1676
+ url: The URL/link to store
1677
+ description: Searchable details of the link
1678
+ instructions: Instructions for how to consume the link or API
1679
+ api_key: Any authentication key needed to access the API. If included, you should also
1680
+ describe how the key should be consumed in the instructions.
1681
+ metadata: Any additional metadata about the link
1682
+ dataset_id: Optional dataset ID to add entry to, or None to create new dataset
1683
+ project_id: ID of the project this data storage entry belongs to
1684
+
1685
+ Returns:
1686
+ DataStorageResponse containing the created link storage entry
1687
+
1688
+ Raises:
1689
+ DataStorageCreationError: If there's an error creating the link storage entry
1690
+ """
1691
+ try:
1692
+ link_metadata = metadata.copy() if metadata else {}
1693
+ link_metadata["instructions"] = instructions
1694
+ if api_key:
1695
+ link_metadata["api_key"] = api_key
1696
+
1697
+ existing_location = DataStorageLocationPayload(
1698
+ storage_type=DataStorageType.LINK,
1699
+ content_type=DataContentType.TEXT,
1700
+ location=str(url),
1701
+ metadata=link_metadata or None,
1702
+ )
1703
+
1704
+ payload = DataStorageRequestPayload(
1705
+ name=name,
1706
+ content=str(url),
1707
+ description=description,
1708
+ dataset_id=dataset_id,
1709
+ project_id=project_id,
1710
+ existing_location=existing_location,
1711
+ )
1712
+ return self._create_data_storage_entry(payload)
1713
+ except HTTPStatusError as e:
1714
+ self._handle_http_errors(e, "creating")
1715
+ except Exception as e:
1716
+ raise DataStorageCreationError(
1717
+ f"An unexpected error occurred: {e!r}"
1718
+ ) from e
1719
+
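As an illustrative usage sketch (assuming `client` is an already-authenticated futurehouse client instance and the URL is a placeholder), registering an external API as a link entry looks roughly like this:

    # Store a link entry; the instructions tell downstream consumers how to
    # call the API, and any api_key is kept in the entry's metadata.
    link_entry = client.store_link(
        name="uniprot-rest",
        url="https://rest.uniprot.org",
        description="UniProt REST API for protein records",
        instructions="GET /uniprotkb/{accession}.json; no authentication required",
    )
    # link_entry is a DataStorageResponse for the newly created entry.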
1720
+ @retry(
1721
+ stop=stop_after_attempt(3),
1722
+ wait=wait_exponential(multiplier=1, max=10),
1723
+ retry=retry_if_connection_error,
1724
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1725
+ )
1726
+ def store_file_content(
1727
+ self,
1728
+ name: str,
1729
+ file_path: str | Path,
1730
+ description: str | None = None,
1731
+ path: str | None = None,
1732
+ as_collection: bool = False,
1733
+ manifest_filename: str | None = None,
1734
+ ignore_patterns: list[str] | None = None,
1735
+ ignore_filename: str = ".gitignore",
1736
+ project_id: UUID | None = None,
1737
+ ) -> DataStorageResponse:
1738
+ """Store file or directory content in the data storage system.
1739
+
1740
+ For files: Small text files (< 10MB, supported formats) are sent as text content,
1741
+ larger/binary files are uploaded via signed URL.
1742
+
1743
+ For directories: Zipped as a single file with ignore pattern support and uploaded
1744
+ as a collection.
1745
+
1746
+ Args:
1747
+ name: Name of the data storage entry
1748
+ file_path: Path to file or directory to upload
1749
+ description: Optional description of the data storage entry
1750
+ path: Optional path for the data storage entry
1751
+ as_collection: If true, upload directories as a single zip file collection.
1752
+ manifest_filename: Optional manifest filename used for hierarchical (non-collection) uploads
1753
+ ignore_patterns: List of patterns to ignore when zipping directories
1754
+ ignore_filename: Name of ignore file to read from directory (default: .gitignore)
1755
+ project_id: ID of the project this data storage entry belongs to
1756
+
1757
+ Returns:
1758
+ DataStorageResponse containing the final data storage entry
1759
+
1760
+ Raises:
1761
+ DataStorageCreationError: If there's an error in the process
1762
+ """
1763
+ file_path = self._validate_file_path(file_path)
1764
+
1765
+ try:
1766
+ if file_path.is_dir() and as_collection:
1767
+ return self._upload_data_directory(
1768
+ name,
1769
+ file_path,
1770
+ description,
1771
+ path,
1772
+ ignore_patterns,
1773
+ ignore_filename,
1774
+ project_id,
1775
+ )
1776
+ if file_path.is_dir() and not as_collection:
1777
+ responses = self._upload_directory_hierarchically(
1778
+ name=name,
1779
+ dir_path=file_path,
1780
+ description=description,
1781
+ manifest_filename=manifest_filename,
1782
+ ignore_patterns=ignore_patterns,
1783
+ ignore_filename=ignore_filename,
1784
+ project_id=project_id,
1785
+ )
1786
+ if not responses:
1787
+ raise DataStorageCreationError(
1788
+ "No data storage entries were created"
1789
+ )
1790
+ return responses[0]
1791
+ return self._upload_data_single_file(
1792
+ name, file_path, description, path, project_id
1793
+ )
1794
+
1795
+ except HTTPStatusError as e:
1796
+ self._handle_http_errors(e, "creating")
1797
+ except Exception as e:
1798
+ raise DataStorageCreationError(
1799
+ f"An unexpected error occurred during file upload: {e!r}"
1800
+ ) from e
1801
+
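A short sketch of the two directory-upload modes (the paths and names are placeholders; `client` is assumed to be an existing client instance):

    # Zip an entire directory into one collection entry, honoring ignore patterns.
    collection_entry = client.store_file_content(
        name="analysis-notebooks",
        file_path="./notebooks",
        description="Notebooks for the Q3 analysis",
        as_collection=True,
        ignore_patterns=["__pycache__", "*.ipynb_checkpoints"],
    )

    # Or upload the same tree hierarchically, one entry per file; the returned
    # response corresponds to the root directory entry.
    root_entry = client.store_file_content(
        name="analysis-notebooks",
        file_path="./notebooks",
        as_collection=False,
    )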
1802
+ @retry(
1803
+ stop=stop_after_attempt(3),
1804
+ wait=wait_exponential(multiplier=1, max=10),
1805
+ retry=retry_if_connection_error,
1806
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1807
+ )
1808
+ async def astore_file_content(
1809
+ self,
1810
+ name: str,
1811
+ file_path: str | Path,
1812
+ description: str | None = None,
1813
+ path: str | None = None,
1814
+ as_collection: bool = False,
1815
+ manifest_filename: str | None = None,
1816
+ ignore_patterns: list[str] | None = None,
1817
+ ignore_filename: str = ".gitignore",
1818
+ dataset_id: UUID | None = None,
1819
+ project_id: UUID | None = None,
1820
+ ) -> DataStorageResponse:
1821
+ """Asynchronously store file or directory content in the data storage system.
1822
+
1823
+ Args:
1824
+ name: Name of the data storage entry.
1825
+ file_path: Path to the file or directory to upload.
1826
+ description: Optional description for the entry.
1827
+ path: Optional GCS path for the entry.
1828
+ as_collection: If uploading a directory, `True` zips it into a single collection,
1829
+ `False` uploads it as a hierarchical structure of individual objects.
1830
+ manifest_filename: Optional manifest file for hierarchical uploads.
1831
+ ignore_patterns: List of patterns to ignore when zipping.
1832
+ ignore_filename: Name of ignore file to read (default: .gitignore).
1833
+ dataset_id: Optional dataset ID to add entry to, or None to create new dataset.
1834
+ project_id: ID of the project this data storage entry belongs to
1835
+
1836
+ Returns:
1837
+ The `DataStorageResponse` for the created entry. For hierarchical uploads,
1838
+ this is the response for the root directory entry.
1839
+ """
1840
+ file_path = self._validate_file_path(file_path)
1841
+
1842
+ try:
1843
+ if file_path.is_dir():
1844
+ if as_collection:
1845
+ return await self._aupload_data_directory(
1846
+ name,
1847
+ file_path,
1848
+ description,
1849
+ path,
1850
+ ignore_patterns,
1851
+ ignore_filename,
1852
+ project_id,
1853
+ )
1854
+ responses = await self._aupload_directory_hierarchically(
1855
+ name=name,
1856
+ dir_path=file_path,
1857
+ description=description,
1858
+ manifest_filename=manifest_filename,
1859
+ ignore_patterns=ignore_patterns,
1860
+ ignore_filename=ignore_filename,
1861
+ dataset_id=dataset_id,
1862
+ project_id=project_id,
1863
+ )
1864
+ if not responses:
1865
+ raise DataStorageCreationError(
1866
+ "No data storage entries were created"
1867
+ )
1868
+ return responses[0]
1869
+ return await self._aupload_data_single_file(
1870
+ name, file_path, description, path, dataset_id, project_id
1871
+ )
1872
+
1873
+ except HTTPStatusError as e:
1874
+ self._handle_http_errors(e, "creating")
1875
+ except Exception as e:
1876
+ raise DataStorageCreationError(
1877
+ f"An unexpected error occurred during async file upload: {e!r}"
1878
+ ) from e
1879
+
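The async variant can also attach the upload to an existing dataset; a minimal sketch, assuming `client` and `dataset_id` are already defined:

    import asyncio

    async def upload_reference_data() -> None:
        entry = await client.astore_file_content(
            name="reference-data",
            file_path="./reference_data",
            description="Reference tables used by the pipeline",
            dataset_id=dataset_id,  # optional; omit to create a new dataset
        )
        print(entry)

    asyncio.run(upload_reference_data())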
1880
+ @retry(
1881
+ stop=stop_after_attempt(3),
1882
+ wait=wait_exponential(multiplier=1, max=10),
1883
+ retry=retry_if_connection_error,
1884
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1885
+ )
1886
+ def register_existing_data_source(
1887
+ self,
1888
+ name: str,
1889
+ existing_location: DataStorageLocationPayload,
1890
+ description: str | None = None,
1891
+ as_collection: bool = False,
1892
+ path: str | None = None,
1893
+ project_id: UUID | None = None,
1894
+ ) -> DataStorageResponse:
1895
+ """Store content as a string in the data storage system.
1896
+
1897
+ Args:
1898
+ name: Name of the data storage entry
1899
+ existing_location: Describes the existing data source location to register
1900
+ description: Optional description of the data storage entry
1901
+ as_collection: If uploading a directory, `True` creates a single storage entry for
1902
+ the whole directory and multiple storage locations for each file, `False` assumes
1903
+ you are uploading a single file.
1904
+ path: Optional path for the data storage entry
1905
+ project_id: ID of the project this data storage entry belongs to
1906
+
1907
+ Returns:
1908
+ DataStorageResponse containing the created data storage entry and storage locations
1909
+
1910
+ Raises:
1911
+ DataStorageCreationError: If there's an error creating the data storage entry
1912
+ """
1913
+ try:
1914
+ payload = DataStorageRequestPayload(
1915
+ name=name,
1916
+ description=description,
1917
+ path=path,
1918
+ existing_location=existing_location,
1919
+ project_id=project_id,
1920
+ is_collection=as_collection,
1921
+ )
1922
+ response = self.client.post(
1923
+ "/v0.1/data-storage/data-entries",
1924
+ json=payload.model_dump(exclude_none=True),
1925
+ )
1926
+ response.raise_for_status()
1927
+ return DataStorageResponse.model_validate(response.json())
1928
+ except HTTPStatusError as e:
1929
+ self._handle_http_errors(e, "creating")
1930
+ except Exception as e:
1931
+ raise DataStorageCreationError(
1932
+ f"An unexpected error occurred: {e!r}"
1933
+ ) from e
1934
+
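A sketch of registering data that already lives in a bucket so it is indexed without re-uploading; `DataStorageType.GCS` is assumed to be the enum member for bucket-backed locations, and the bucket path is a placeholder:

    location = DataStorageLocationPayload(
        storage_type=DataStorageType.GCS,  # assumed member name for GCS-backed storage
        content_type=DataContentType.TEXT,
        location="gs://my-bucket/datasets/expression_matrix.csv",
    )
    entry = client.register_existing_data_source(
        name="expression-matrix",
        existing_location=location,
        description="Pre-existing expression matrix stored in GCS",
    )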
1935
+ @retry(
1936
+ stop=stop_after_attempt(3),
1937
+ wait=wait_exponential(multiplier=1, max=10),
1938
+ retry=retry_if_connection_error,
1939
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1940
+ )
1941
+ async def aregister_existing_data_source(
1942
+ self,
1943
+ name: str,
1944
+ existing_location: DataStorageLocationPayload,
1945
+ as_collection: bool = False,
1946
+ description: str | None = None,
1947
+ path: str | None = None,
1948
+ project_id: UUID | None = None,
1949
+ ) -> DataStorageResponse:
1950
+ """Store content as a string in the data storage system.
1951
+
1952
+ Args:
1953
+ name: Name of the data storage entry
1954
+ existing_location: Describes the existing data source location to register
1955
+ description: Optional description of the data storage entry
1956
+ as_collection: If uploading a directory, `True` creates a single storage entry for
1957
+ the whole directory and multiple storage locations for each file, `False` assumes
1958
+ you are uploading a single file.
1959
+ path: Optional path for the data storage entry
1960
+ project_id: ID of the project this data storage entry belongs to
1961
+
1962
+ Returns:
1963
+ DataStorageResponse containing the created data storage entry and storage locations
1964
+
1965
+ Raises:
1966
+ DataStorageCreationError: If there's an error creating the data storage entry
1967
+ """
1968
+ try:
1969
+ payload = DataStorageRequestPayload(
1970
+ name=name,
1971
+ description=description,
1972
+ path=path,
1973
+ existing_location=existing_location,
1974
+ project_id=project_id,
1975
+ is_collection=as_collection,
1976
+ )
1977
+ response = await self.async_client.post(
1978
+ "/v0.1/data-storage/data-entries",
1979
+ json=payload.model_dump(exclude_none=True),
1980
+ )
1981
+ response.raise_for_status()
1982
+ return DataStorageResponse.model_validate(response.json())
1983
+ except HTTPStatusError as e:
1984
+ self._handle_http_errors(e, "creating")
1985
+ except Exception as e:
1986
+ raise DataStorageCreationError(
1987
+ f"An unexpected error occurred: {e!r}"
1988
+ ) from e
1989
+
1990
+ @retry(
1991
+ stop=stop_after_attempt(3),
1992
+ wait=wait_exponential(multiplier=1, max=10),
1993
+ retry=retry_if_connection_error,
1994
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1995
+ )
1996
+ def search_data_storage(
1997
+ self,
1998
+ criteria: list[SearchCriterion] | None = None,
1999
+ size: int = 10,
2000
+ ) -> list[dict]:
2001
+ """Search data storage objects using structured criteria.
2002
+
2003
+ Args:
2004
+ criteria: List of search criteria (SearchCriterion objects with field, operator, value)
2005
+ size: Number of results to return (1-100)
2006
+
2007
+ Returns:
2008
+ List of search results with scores and data storage information
2009
+
2010
+ Raises:
2011
+ DataStorageCreationError: If there's an error searching data storage entries
2012
+
2013
+ Example:
2014
+ from futurehouse_client.models.rest import SearchCriterion, SearchOperator
2015
+ criteria = [
2016
+ SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
2017
+ SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
2018
+ SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
2019
+ ]
2020
+ results = client.search_data_storage(criteria=criteria, size=20)
2021
+ """
2022
+ try:
2023
+ payload = DataStorageSearchPayload(
2024
+ criteria=criteria or [],
2025
+ size=max(1, min(100, size)), # Clamp between 1-100
2026
+ )
2027
+
2028
+ response = self.client.post(
2029
+ "/v0.1/data-storage/search",
2030
+ json=payload.model_dump(mode="json"),
2031
+ )
2032
+ response.raise_for_status()
2033
+ return response.json()
2034
+
2035
+ except HTTPStatusError as e:
2036
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
2037
+ raise DataStorageCreationError(
2038
+ "Search functionality is currently unavailable"
2039
+ ) from e
2040
+ self._handle_http_errors(e, "searching")
2041
+ except Exception as e:
2042
+ raise DataStorageCreationError(
2043
+ f"An unexpected error occurred during search: {e!r}"
2044
+ ) from e
2045
+
2046
+ @retry(
2047
+ stop=stop_after_attempt(3),
2048
+ wait=wait_exponential(multiplier=1, max=10),
2049
+ retry=retry_if_connection_error,
2050
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2051
+ )
2052
+ async def asearch_data_storage(
2053
+ self,
2054
+ criteria: list[SearchCriterion] | None = None,
2055
+ size: int = 10,
2056
+ ) -> list[dict]:
2057
+ """Asynchronously search data storage objects using structured criteria.
2058
+
2059
+ Args:
2060
+ criteria: List of search criteria (SearchCriterion objects with field, operator, value)
2061
+ size: Number of results to return (1-100)
2062
+
2063
+ Returns:
2064
+ List of search results with scores and data storage information
2065
+
2066
+ Raises:
2067
+ DataStorageCreationError: If there's an error searching data storage entries
2068
+
2069
+ Example:
2070
+ from futurehouse_client.models.rest import SearchCriterion, SearchOperator
2071
+ criteria = [
2072
+ SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
2073
+ SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
2074
+ SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
2075
+ ]
2076
+ results = await client.asearch_data_storage(criteria=criteria, size=20)
2077
+ """
2078
+ try:
2079
+ payload = DataStorageSearchPayload(
2080
+ criteria=criteria or [],
2081
+ size=max(1, min(100, size)), # Clamp between 1-100
2082
+ )
2083
+
2084
+ response = await self.async_client.post(
2085
+ "/v0.1/data-storage/search",
2086
+ json=payload.model_dump(mode="json"),
2087
+ )
2088
+ response.raise_for_status()
2089
+ return response.json()
2090
+
2091
+ except HTTPStatusError as e:
2092
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
2093
+ raise DataStorageCreationError(
2094
+ "Search functionality is currently unavailable"
2095
+ ) from e
2096
+ self._handle_http_errors(e, "searching")
2097
+ except Exception as e:
2098
+ raise DataStorageCreationError(
2099
+ f"An unexpected error occurred during async search: {e!r}"
2100
+ ) from e
2101
+
2102
+ @retry(
2103
+ stop=stop_after_attempt(3),
2104
+ wait=wait_exponential(multiplier=1, max=10),
2105
+ retry=retry_if_connection_error,
2106
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2107
+ )
2108
+ def similarity_search_data_storage(
2109
+ self,
2110
+ embedding: list[float],
2111
+ size: int = 10,
2112
+ min_score: float = 0.7,
2113
+ dataset_id: UUID | None = None,
2114
+ tags: list[str] | None = None,
2115
+ user_id: str | None = None,
2116
+ project_id: str | None = None,
2117
+ ) -> list[dict]:
2118
+ """Search data storage objects using vector similarity.
2119
+
2120
+ Args:
2121
+ embedding: Embedding vector for similarity search
2122
+ size: Number of results to return (1-100)
2123
+ min_score: Minimum similarity score (0.0-1.0)
2124
+ dataset_id: Optional dataset ID filter
2125
+ tags: Optional list of tags to filter by
2126
+ user_id: Optional user ID filter (admin only)
2127
+ project_id: Optional project ID filter
2128
+
2129
+ Returns:
2130
+ List of search results with similarity scores and data storage information
2131
+
2132
+ Raises:
2133
+ DataStorageCreationError: If there's an error performing similarity search
2134
+ """
2135
+ try:
2136
+ # Validate inputs
2137
+ if not embedding:
2138
+ raise DataStorageCreationError("Embedding vector is required")
2139
+
2140
+ if not all(isinstance(x, int | float) for x in embedding):
2141
+ raise DataStorageCreationError("Embedding must be a list of numbers")
2142
+
2143
+ size = max(1, min(100, size)) # Clamp between 1-100
2144
+ min_score = max(0.0, min(1.0, min_score)) # Clamp between 0.0-1.0
2145
+
2146
+ # Build request payload
2147
+ payload = {
2148
+ "embedding": embedding,
2149
+ "size": size,
2150
+ "min_score": min_score,
2151
+ }
2152
+
2153
+ # Add optional filters
2154
+ if dataset_id is not None:
2155
+ payload["dataset_id"] = str(dataset_id)
2156
+ if tags is not None:
2157
+ payload["tags"] = tags
2158
+ if user_id is not None:
2159
+ payload["user_id"] = user_id
2160
+ if project_id is not None:
2161
+ payload["project_id"] = project_id
2162
+
2163
+ response = self.client.post(
2164
+ "/v0.1/data-storage/similarity-search", json=payload
2165
+ )
2166
+ response.raise_for_status()
2167
+ return response.json()
2168
+
2169
+ except HTTPStatusError as e:
2170
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
2171
+ raise DataStorageCreationError(
2172
+ "Similarity search functionality is currently unavailable"
2173
+ ) from e
2174
+ self._handle_http_errors(e, "performing similarity search")
2175
+ except Exception as e:
2176
+ raise DataStorageCreationError(
2177
+ f"An unexpected error occurred during similarity search: {e!r}"
2178
+ ) from e
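A sketch of a similarity query, assuming `query_embedding` was produced by whatever embedding model the deployment pairs with this index:

    # size and min_score are clamped client-side to [1, 100] and [0.0, 1.0].
    hits = client.similarity_search_data_storage(
        embedding=query_embedding,
        size=5,
        min_score=0.75,
        tags=["rna-seq"],
    )
    for hit in hits:
        print(hit)  # each hit carries a similarity score and entry information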
2179
+
2180
+ @retry(
2181
+ stop=stop_after_attempt(3),
2182
+ wait=wait_exponential(multiplier=1, max=10),
2183
+ retry=retry_if_connection_error,
2184
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2185
+ )
2186
+ async def asimilarity_search_data_storage(
2187
+ self,
2188
+ embedding: list[float],
2189
+ size: int = 10,
2190
+ min_score: float = 0.7,
2191
+ dataset_id: UUID | None = None,
2192
+ tags: list[str] | None = None,
2193
+ user_id: str | None = None,
2194
+ project_id: str | None = None,
2195
+ ) -> list[dict]:
2196
+ """Asynchronously search data storage objects using vector similarity.
2197
+
2198
+ Args:
2199
+ embedding: Embedding vector for similarity search
2200
+ size: Number of results to return (1-100)
2201
+ min_score: Minimum similarity score (0.0-1.0)
2202
+ dataset_id: Optional dataset ID filter
2203
+ tags: Optional list of tags to filter by
2204
+ user_id: Optional user ID filter (admin only)
2205
+ project_id: Optional project ID filter
2206
+
2207
+ Returns:
2208
+ List of search results with similarity scores and data storage information
2209
+
2210
+ Raises:
2211
+ DataStorageCreationError: If there's an error performing similarity search
2212
+ """
2213
+ try:
2214
+ # Validate inputs
2215
+ if not embedding:
2216
+ raise DataStorageCreationError("Embedding vector is required")
2217
+
2218
+ if not all(isinstance(x, int | float) for x in embedding):
2219
+ raise DataStorageCreationError("Embedding must be a list of numbers")
2220
+
2221
+ size = max(1, min(100, size)) # Clamp between 1-100
2222
+ min_score = max(0.0, min(1.0, min_score)) # Clamp between 0.0-1.0
2223
+
2224
+ # Build request payload
2225
+ payload = {
2226
+ "embedding": embedding,
2227
+ "size": size,
2228
+ "min_score": min_score,
2229
+ }
2230
+
2231
+ # Add optional filters
2232
+ if dataset_id is not None:
2233
+ payload["dataset_id"] = str(dataset_id)
2234
+ if tags is not None:
2235
+ payload["tags"] = tags
2236
+ if user_id is not None:
2237
+ payload["user_id"] = user_id
2238
+ if project_id is not None:
2239
+ payload["project_id"] = project_id
2240
+
2241
+ response = await self.async_client.post(
2242
+ "/v0.1/data-storage/similarity-search", json=payload
2243
+ )
2244
+ response.raise_for_status()
2245
+ return response.json()
2246
+
2247
+ except HTTPStatusError as e:
2248
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
2249
+ raise DataStorageCreationError(
2250
+ "Similarity search functionality is currently unavailable"
2251
+ ) from e
2252
+ self._handle_http_errors(e, "performing similarity search")
2253
+ except Exception as e:
2254
+ raise DataStorageCreationError(
2255
+ f"An unexpected error occurred during async similarity search: {e!r}"
2256
+ ) from e
2257
+
2258
+ @retry(
2259
+ stop=stop_after_attempt(3),
2260
+ wait=wait_exponential(multiplier=1, max=10),
2261
+ retry=retry_if_connection_error,
2262
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2263
+ )
2264
+ def fetch_data_from_storage(
2265
+ self,
2266
+ data_storage_id: UUID | None = None,
2267
+ ) -> str | Path | list[Path] | None:
2268
+ """Fetch data from the storage system (sync version).
2269
+
2270
+ Args:
2271
+ data_storage_id: ID of the data storage entry to fetch
2272
+
2273
+ Returns:
2274
+ For PG_TABLE storage: string content
2275
+ For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
2276
+ For multi-location entries: list of Paths to the downloaded files
2277
+ None if the entry exists but has no inline content
2278
+ """
2279
+ if not data_storage_id:
2280
+ raise DataStorageRetrievalError(
2281
+ "data_storage_id must be provided at this time"
2282
+ )
2283
+
2284
+ try:
2285
+ response = self.client.get(
2286
+ f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
2287
+ )
2288
+ response.raise_for_status()
2289
+ result = DataStorageResponse.model_validate(response.json())
2290
+
2291
+ if len(result.storage_locations) > 1:
2292
+ return [
2293
+ self._download_from_gcs(
2294
+ location.storage_config.signed_url or "",
2295
+ (location.storage_config.location or "").split("/")[-1],
2296
+ )
2297
+ for location in result.storage_locations
2298
+ ]
2299
+
2300
+ # Most scenarios will only have one location
2301
+ storage_location = result.storage_locations[0]
2302
+ storage_type = storage_location.storage_config.storage_type
2303
+
2304
+ if storage_type == "gcs":
2305
+ if not storage_location.storage_config.signed_url:
2306
+ raise DataStorageRetrievalError(
2307
+ "No signed URL available for GCS download"
2308
+ )
2309
+
2310
+ return self._download_from_gcs(
2311
+ storage_location.storage_config.signed_url
2312
+ )
2313
+
2314
+ if storage_type in {"raw_content", "pg_table"}:
2315
+ content = result.data_storage.content
2316
+ if content is None:
2317
+ logger.warning(
2318
+ f"No content found for data storage entry {data_storage_id}"
2319
+ )
2320
+ return None
2321
+ return content
2322
+
2323
+ raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")
2324
+
2325
+ except HTTPStatusError as e:
2326
+ self._handle_http_errors(e, "retrieving")
2327
+ except Exception as e:
2328
+ raise DataStorageRetrievalError(
2329
+ f"An unexpected error occurred: {e!r}"
2330
+ ) from e
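Because the return type depends on how the entry is stored, callers typically branch on it; a sketch using a placeholder UUID:

    from pathlib import Path
    from uuid import UUID

    data = client.fetch_data_from_storage(
        data_storage_id=UUID("00000000-0000-0000-0000-000000000000"),
    )
    if isinstance(data, Path):
        print(f"Downloaded file to {data}")
    elif isinstance(data, list):
        print(f"Downloaded {len(data)} files")
    elif data is not None:
        print(data)  # inline text content from raw_content/pg_table storage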
2331
+
2332
+ @retry(
2333
+ stop=stop_after_attempt(3),
2334
+ wait=wait_exponential(multiplier=1, max=10),
2335
+ retry=retry_if_connection_error,
2336
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2337
+ )
2338
+ async def afetch_data_from_storage(
2339
+ self,
2340
+ data_storage_id: UUID | None = None,
2341
+ ) -> str | Path | list[Path] | None:
2342
+ """Fetch data from the storage system.
2343
+
2344
+ Args:
2345
+ data_storage_id: ID of the data storage entry to fetch
2346
+
2347
+ Returns:
2348
+ For PG_TABLE storage: string content
2349
+ For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
2350
+ For multi-location entries: list of Paths to the downloaded files
2351
+ None if the entry exists but has no inline content
2352
+ """
2353
+ if not data_storage_id:
2354
+ raise DataStorageRetrievalError(
2355
+ "data_storage_id must be provided at this time"
2356
+ )
2357
+
2358
+ try:
2359
+ response = await self.async_client.get(
2360
+ f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
2361
+ )
2362
+ response.raise_for_status()
2363
+ result = DataStorageResponse.model_validate(response.json())
2364
+
2365
+ if len(result.storage_locations) > 1:
2366
+ return await gather_with_concurrency(
2367
+ DOWNLOAD_CONCURRENCY,
2368
+ [
2369
+ self._adownload_from_gcs(
2370
+ location.storage_config.signed_url or "",
2371
+ (location.storage_config.location or "").split("/")[-1],
2372
+ )
2373
+ for location in result.storage_locations
2374
+ ],
2375
+ )
2376
+
2377
+ # Most scenarios will only have one location
2378
+ storage_location = result.storage_locations[0]
2379
+ storage_type = storage_location.storage_config.storage_type
2380
+
2381
+ if storage_type == "gcs":
2382
+ if not storage_location.storage_config.signed_url:
2383
+ raise DataStorageRetrievalError(
2384
+ "No signed URL available for GCS download"
2385
+ )
2386
+
2387
+ return await self._adownload_from_gcs(
2388
+ storage_location.storage_config.signed_url
2389
+ )
2390
+
2391
+ if storage_type in {"raw_content", "pg_table"}:
2392
+ content = result.data_storage.content
2393
+ if content is None:
2394
+ logger.warning(
2395
+ f"No content found for data storage entry {data_storage_id}"
2396
+ )
2397
+ return None
2398
+ return content
2399
+
2400
+ raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")
2401
+
2402
+ except HTTPStatusError as e:
2403
+ self._handle_http_errors(e, "retrieving")
2404
+ except Exception as e:
2405
+ raise DataStorageRetrievalError(
2406
+ f"An unexpected error occurred: {e!r}"
2407
+ ) from e
2408
+
2409
+ @retry(
2410
+ stop=stop_after_attempt(3),
2411
+ wait=wait_exponential(multiplier=1, max=10),
2412
+ retry=retry_if_connection_error,
2413
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2414
+ )
2415
+ async def acreate_dataset(
2416
+ self,
2417
+ name: str,
2418
+ description: str | None = None,
2419
+ dataset_id: UUID | None = None,
2420
+ ):
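+ """Asynchronously create a dataset used to group data storage entries.
+
+ Args:
+ name: Name of the dataset
+ description: Optional description of the dataset
+ dataset_id: Optional explicit ID for the dataset, or None to let the service assign one
+
+ Returns:
+ The created dataset, validated as a CreateDatasetPayload
+
+ Raises:
+ DataStorageCreationError: If there's an error creating the dataset
+ """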
2421
+ try:
2422
+ payload = CreateDatasetPayload(
2423
+ name=name,
2424
+ description=description,
2425
+ id=dataset_id,
2426
+ )
2427
+ response = await self.async_client.post(
2428
+ "/v0.1/data-storage/datasets",
2429
+ json=payload.model_dump(exclude_none=True),
2430
+ )
2431
+ response.raise_for_status()
2432
+ return CreateDatasetPayload.model_validate(response.json())
2433
+ except HTTPStatusError as e:
2434
+ self._handle_http_errors(e, "creating")
2435
+ except Exception as e:
2436
+ raise DataStorageCreationError(
2437
+ f"An unexpected error occurred: {e!r}"
2438
+ ) from e
2439
+
2440
+ @retry(
2441
+ stop=stop_after_attempt(3),
2442
+ wait=wait_exponential(multiplier=1, max=10),
2443
+ retry=retry_if_connection_error,
2444
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2445
+ )
2446
+ def create_dataset(
2447
+ self,
2448
+ name: str,
2449
+ description: str | None = None,
2450
+ dataset_id: UUID | None = None,
2451
+ ):
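+ """Create a dataset used to group data storage entries.
+
+ Args:
+ name: Name of the dataset
+ description: Optional description of the dataset
+ dataset_id: Optional explicit ID for the dataset, or None to let the service assign one
+
+ Returns:
+ The created dataset, validated as a CreateDatasetPayload
+
+ Raises:
+ DataStorageCreationError: If there's an error creating the dataset
+ """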
2452
+ try:
2453
+ payload = CreateDatasetPayload(
2454
+ name=name,
2455
+ description=description,
2456
+ id=dataset_id,
2457
+ )
2458
+ response = self.client.post(
2459
+ "/v0.1/data-storage/datasets",
2460
+ json=payload.model_dump(exclude_none=True),
2461
+ )
2462
+ response.raise_for_status()
2463
+ return CreateDatasetPayload.model_validate(response.json())
2464
+ except HTTPStatusError as e:
2465
+ self._handle_http_errors(e, "creating")
2466
+ except Exception as e:
2467
+ raise DataStorageCreationError(
2468
+ f"An unexpected error occurred: {e!r}"
2469
+ ) from e
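Datasets group entries together; a sketch that creates one and then attaches a link entry to it (the GEO accession is a placeholder, and the response is assumed to echo back the dataset id):

    dataset = client.create_dataset(
        name="rna-seq-runs",
        description="External references for the RNA-seq project",
    )
    client.store_link(
        name="geo-series",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE000000",
        description="GEO series backing the analysis",
        instructions="Fetch supplementary files from the series page",
        dataset_id=dataset.id,
    )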
2470
+
2471
+ @retry(
2472
+ stop=stop_after_attempt(3),
2473
+ wait=wait_exponential(multiplier=1, max=10),
2474
+ retry=retry_if_connection_error,
2475
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2476
+ )
2477
+ async def adelete_dataset(self, dataset_id: UUID):
2478
+ """Delete a dataset.
2479
+
2480
+ Note: This will delete all data storage entries associated with the dataset.
2481
+
2482
+ Args:
2483
+ dataset_id: ID of the dataset to delete
2484
+
2485
+ Raises:
2486
+ DataStorageError: If there's an error deleting the dataset
2487
+ """
2488
+ try:
2489
+ await self.async_client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
2490
+ except HTTPStatusError as e:
2491
+ self._handle_http_errors(e, "deleting")
2492
+ except Exception as e:
2493
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2494
+
2495
+ @retry(
2496
+ stop=stop_after_attempt(3),
2497
+ wait=wait_exponential(multiplier=1, max=10),
2498
+ retry=retry_if_connection_error,
2499
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2500
+ )
2501
+ def delete_dataset(self, dataset_id: UUID):
2502
+ """Delete a dataset.
2503
+
2504
+ Note: This will delete all data storage entries associated with the dataset.
2505
+
2506
+ Args:
2507
+ dataset_id: ID of the dataset to delete
2508
+
2509
+ Raises:
2510
+ DataStorageError: If there's an error deleting the dataset
2511
+ """
2512
+ try:
2513
+ self.client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
2514
+ except HTTPStatusError as e:
2515
+ self._handle_http_errors(e, "deleting")
2516
+ except Exception as e:
2517
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2518
+
2519
+ @retry(
2520
+ stop=stop_after_attempt(3),
2521
+ wait=wait_exponential(multiplier=1, max=10),
2522
+ retry=retry_if_connection_error,
2523
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2524
+ )
2525
+ async def aget_dataset(self, dataset_id: UUID):
2526
+ try:
2527
+ response = await self.async_client.get(
2528
+ f"/v0.1/data-storage/datasets/{dataset_id}"
2529
+ )
2530
+ response.raise_for_status()
2531
+
2532
+ return response.json()
2533
+ except HTTPStatusError as e:
2534
+ self._handle_http_errors(e, "retrieving")
2535
+ except Exception as e:
2536
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2537
+
2538
+ @retry(
2539
+ stop=stop_after_attempt(3),
2540
+ wait=wait_exponential(multiplier=1, max=10),
2541
+ retry=retry_if_connection_error,
2542
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2543
+ )
2544
+ def get_dataset(self, dataset_id: UUID):
2545
+ try:
2546
+ response = self.client.get(f"/v0.1/data-storage/datasets/{dataset_id}")
2547
+ response.raise_for_status()
2548
+
2549
+ return response.json()
2550
+ except HTTPStatusError as e:
2551
+ self._handle_http_errors(e, "retrieving")
2552
+ except Exception as e:
2553
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2554
+
2555
+ @retry(
2556
+ stop=stop_after_attempt(3),
2557
+ wait=wait_exponential(multiplier=1, max=10),
2558
+ retry=retry_if_connection_error,
2559
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2560
+ )
2561
+ def get_data_storage_entry(self, data_storage_id: UUID) -> DataStorageResponse:
2562
+ """Get a data storage entry with all details including storage locations and metadata.
2563
+
2564
+ Args:
2565
+ data_storage_id: ID of the data storage entry to retrieve
2566
+
2567
+ Returns:
2568
+ DataStorageResponse with entry details and storage locations
2569
+
2570
+ Raises:
2571
+ DataStorageRetrievalError: If there's an error retrieving the entry
2572
+ """
2573
+ try:
2574
+ response = self.client.get(
2575
+ f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
2576
+ )
2577
+ response.raise_for_status()
2578
+ return DataStorageResponse.model_validate(response.json())
2579
+ except HTTPStatusError as e:
2580
+ self._handle_http_errors(e, "retrieving")
2581
+ except Exception as e:
2582
+ raise DataStorageRetrievalError(
2583
+ f"An unexpected error occurred: {e!r}"
2584
+ ) from e
2585
+
2586
+ @retry(
2587
+ stop=stop_after_attempt(3),
2588
+ wait=wait_exponential(multiplier=1, max=10),
2589
+ retry=retry_if_connection_error,
2590
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2591
+ )
2592
+ async def aget_data_storage_entry(
2593
+ self, data_storage_id: UUID
2594
+ ) -> DataStorageResponse:
2595
+ """Get a data storage entry with all details including storage locations and metadata.
2596
+
2597
+ Args:
2598
+ data_storage_id: ID of the data storage entry to retrieve
2599
+
2600
+ Returns:
2601
+ DataStorageResponse with entry details and storage locations
2602
+
2603
+ Raises:
2604
+ DataStorageRetrievalError: If there's an error retrieving the entry
2605
+ """
2606
+ try:
2607
+ response = await self.async_client.get(
2608
+ f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
2609
+ )
2610
+ response.raise_for_status()
2611
+ return DataStorageResponse.model_validate(response.json())
2612
+ except HTTPStatusError as e:
2613
+ self._handle_http_errors(e, "retrieving")
2614
+ except Exception as e:
2615
+ raise DataStorageRetrievalError(
2616
+ f"An unexpected error occurred: {e!r}"
2617
+ ) from e
2618
+
2619
+ @retry(
2620
+ stop=stop_after_attempt(3),
2621
+ wait=wait_exponential(multiplier=1, max=10),
2622
+ retry=retry_if_connection_error,
2623
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2624
+ )
2625
+ async def adelete_data_storage_entry(self, data_storage_entry_id: UUID):
2626
+ try:
2627
+ response = await self.async_client.delete(
2628
+ f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
2629
+ )
+ response.raise_for_status()
2630
+ except HTTPStatusError as e:
2631
+ self._handle_http_errors(e, "deleting")
2632
+ except Exception as e:
2633
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2634
+
2635
+ @retry(
2636
+ stop=stop_after_attempt(3),
2637
+ wait=wait_exponential(multiplier=1, max=10),
2638
+ retry=retry_if_connection_error,
2639
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2640
+ )
2641
+ def delete_data_storage_entry(self, data_storage_entry_id: UUID):
2642
+ try:
2643
+ response = self.client.delete(
2644
+ f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
2645
+ )
+ response.raise_for_status()
2646
+ except HTTPStatusError as e:
2647
+ self._handle_http_errors(e, "deleting")
2648
+ except Exception as e:
2649
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e