edison-client 0.6.8.dev92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3309 @@
1
+ import asyncio
2
+ import fnmatch
3
+ import json
4
+ import logging
5
+ import shutil
6
+ import tempfile
7
+ import zipfile
8
+ from os import PathLike
9
+ from pathlib import Path
10
+ from typing import Any, NoReturn
11
+ from uuid import UUID
12
+
13
+ import aiofiles
14
+ import aiohttp
15
+ import requests as requests_lib
16
+ from google.resumable_media import requests as resumable_requests
17
+ from httpx import AsyncClient, Client, HTTPStatusError, codes
18
+ from lmi.utils import gather_with_concurrency
19
+ from pydantic import HttpUrl
20
+ from requests.adapters import HTTPAdapter
21
+ from tenacity import (
22
+ before_sleep_log,
23
+ retry,
24
+ stop_after_attempt,
25
+ wait_exponential,
26
+ )
27
+ from tqdm import tqdm
28
+ from urllib3.util.retry import Retry
29
+
30
+ from edison_client.models.data_storage_methods import (
31
+ CreateDatasetPayload,
32
+ DataContentType,
33
+ DataStorageLocationPayload,
34
+ DataStorageRequestPayload,
35
+ DataStorageResponse,
36
+ DataStorageType,
37
+ DirectoryManifest,
38
+ GetDatasetAndEntriesResponse,
39
+ ManifestEntry,
40
+ PermittedAccessors,
41
+ RawFetchResponse,
42
+ ShareStatus,
43
+ )
44
+ from edison_client.models.rest import (
45
+ DataStorageSearchPayload,
46
+ FilterLogic,
47
+ SearchCriterion,
48
+ )
49
+ from edison_client.utils.general import retry_if_connection_error
50
+
51
+ # this is only required if they're using a yaml manifest
52
+ try:
53
+ import yaml
54
+ except ImportError:
55
+ yaml = None # type: ignore[assignment]
56
+
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+ # TODO: pdf support, unsure what package we want to use
61
# Extensions (without the leading dot) eligible to be sent inline as text.
SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT = ["txt", "md", "csv", "json", "yaml", "yml"]
# Chunk size used by the resumable upload helpers.
CHUNK_SIZE = 8 * 1024 * 1024  # 8MB
# Retry budget for chunk uploads and for the requests Retry strategy.
MAX_RETRIES = 3
# Files at or above this size are never sent as inline text content.
SMALL_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024  # 10MB
# Status treated as "chunk accepted, upload not finished" by the chunk uploader.
HTTP_RESUME_INCOMPLETE = 308
# Headers sent with the POST that opens a resumable upload session.
INITIATE_HEADERS = {
    "Content-Type": "application/octet-stream",
    "x-goog-resumable": "start",
    "Content-Length": "0",
}
# NOTE(review): not referenced in the visible part of this file; presumably
# caps the number of parallel downloads - confirm against its users.
DOWNLOAD_CONCURRENCY = 3
72
+
73
+
74
+ def _should_ignore_file(
75
+ file_path: Path | PathLike,
76
+ base_path: Path | PathLike,
77
+ ignore_patterns: list[str] | None = None,
78
+ ) -> bool:
79
+ """Check if a file should be ignored based on ignore patterns.
80
+
81
+ Args:
82
+ file_path: Path to the file to check
83
+ base_path: Base directory path
84
+ ignore_patterns: List of ignore patterns (supports gitignore-style patterns)
85
+
86
+ Returns:
87
+ True if file should be ignored
88
+ """
89
+ if not ignore_patterns:
90
+ return False
91
+
92
+ try:
93
+ file_path = Path(file_path)
94
+ base_path = Path(base_path)
95
+ rel_path = file_path.relative_to(base_path)
96
+ rel_path_str = str(rel_path)
97
+
98
+ for pattern in ignore_patterns:
99
+ pattern = pattern.strip()
100
+ if not pattern or pattern.startswith("#"):
101
+ continue
102
+
103
+ is_absolute_match = pattern.startswith("/") and rel_path_str.startswith(
104
+ pattern[1:]
105
+ )
106
+ is_nested_match = "/" in pattern and pattern in rel_path_str
107
+ is_name_match = fnmatch.fnmatch(file_path.name, pattern)
108
+ is_part_match = pattern in rel_path.parts
109
+
110
+ if is_absolute_match or is_nested_match or is_name_match or is_part_match:
111
+ return True
112
+
113
+ except ValueError:
114
+ pass
115
+
116
+ return False
117
+
118
+
119
def _read_ignore_file(dir_path: Path, ignore_filename: str = ".gitignore") -> list[str]:
    """Load the lines of an ignore file located directly inside a directory.

    Args:
        dir_path: Directory to look for ignore file
        ignore_filename: Name of ignore file to read

    Returns:
        Every line of the file with surrounding whitespace stripped (blank
        and comment lines are kept), or an empty list when the file is
        missing or cannot be read.
    """
    ignore_file = dir_path / ignore_filename
    if not ignore_file.exists():
        return []

    try:
        with open(ignore_file, encoding="utf-8") as handle:
            return list(map(str.strip, handle))
    except Exception as e:
        # Best effort: an unreadable ignore file is treated as empty.
        logger.warning(f"Failed to read {ignore_filename}: {e}")
        return []
139
+
140
+
141
def _collect_ignore_patterns(
    dir_path: Path,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
) -> list[str]:
    """Collect all ignore patterns from multiple sources.

    The result is, in order: the explicit patterns, the lines read from the
    ignore file in ``dir_path``, and a fixed set of default ignores.

    Args:
        dir_path: Directory to check for ignore files
        ignore_patterns: Explicit ignore patterns (never modified)
        ignore_filename: Name of ignore file to read from directory

    Returns:
        A new combined list of ignore patterns
    """
    # Copy first: extending the caller's list in place would make the list
    # grow with file/default patterns on every call that reuses it.
    combined = list(ignore_patterns) if ignore_patterns else []
    combined.extend(_read_ignore_file(dir_path, ignore_filename))
    combined.extend([".git", "__pycache__", "*.pyc", ".DS_Store", "node_modules"])
    return combined
164
+
165
+
166
def _create_directory_zip(
    dir_path: Path,
    zip_path: Path,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
) -> int:
    """Write every non-ignored file under a directory into a deflated zip.

    Archive member names are the file paths relative to ``dir_path``.

    Args:
        dir_path: Directory to zip
        zip_path: Output zip file path
        ignore_patterns: Explicit ignore patterns
        ignore_filename: Name of ignore file to read from directory

    Returns:
        Size of created zip file in bytes
    """
    patterns = _collect_ignore_patterns(dir_path, ignore_patterns, ignore_filename)
    logger.debug(f"Creating zip with ignore patterns: {patterns}")

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for candidate in dir_path.rglob("*"):
            if not candidate.is_file():
                continue
            if _should_ignore_file(candidate, dir_path, patterns):
                continue
            member_name = candidate.relative_to(dir_path)
            archive.write(candidate, member_name)
            logger.debug(f"Added to zip: {member_name}")

    archive_bytes = zip_path.stat().st_size
    logger.debug(f"Created zip file {zip_path} with size {archive_bytes:,} bytes")
    return archive_bytes
201
+
202
+
203
def _should_send_as_text_content(file_path: Path, file_size: int) -> bool:
    """Tell whether a file qualifies to be sent inline as text content.

    Args:
        file_path: Path to the file
        file_size: Size of file in bytes

    Returns:
        True only when the file is smaller than SMALL_FILE_THRESHOLD_BYTES
        and its lower-cased extension is in
        SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT
    """
    # The size test comes first so large files short-circuit before the
    # extension is even looked at.
    return (
        file_size < SMALL_FILE_THRESHOLD_BYTES
        and file_path.suffix.lower().lstrip(".") in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT
    )
219
+
220
+
221
def _extract_text_from_file(file_path: Path) -> str | None:
    """Read a supported text file as UTF-8.

    Args:
        file_path: Path to the file

    Returns:
        The file's text, or None when the extension is not in
        SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT or reading fails
    """
    suffix = file_path.suffix.lower().lstrip(".")
    if suffix not in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT:
        return None

    try:
        text = file_path.read_text(encoding="utf-8")
    except Exception as e:
        # Best effort: the caller falls back to another path on None.
        logger.warning(f"Failed to extract text from {file_path}: {e}")
        return None
    return text
240
+
241
+
242
def _setup_upload_progress(file_path: Path, file_size: int, progress_bar: tqdm) -> None:
    """Log the start of an upload and label the progress bar with the file name."""
    start_message = (
        f"Starting resumable upload for file: {file_path} (size: {file_size:,} bytes)"
    )
    logger.debug(start_message)

    bar_label = f"Uploading {file_path.name}"
    progress_bar.set_description(bar_label)
    # Redraw immediately so the new label shows before the first chunk.
    progress_bar.refresh()
249
+
250
+
251
async def _initiate_resumable_session(
    session: aiohttp.ClientSession, signed_url: str
) -> str:
    """POST to the signed URL to open a resumable upload and return its session URI.

    The session URI is taken from the response's ``location`` header and
    checked by ``_validate_session_uri``. A status other than 200/201 is
    logged and passed through ``raise_for_status()``.
    """
    logger.debug("Initiating resumable upload session")
    async with session.post(signed_url, headers=INITIATE_HEADERS) as response:
        status = response.status
        if status != 200 and status != 201:
            body = await response.text()
            logger.error(f"Failed to initiate resumable session: {status}")
            logger.error(f"Response: {body}")
            response.raise_for_status()

        location = response.headers.get("location")
        return _validate_session_uri(location)
266
+
267
+
268
+ # TODO: temp
269
def _log_upload_debug(signed_url: str) -> None:
    """Emit a debug log line with the first 100 characters of the signed URL."""
    url_preview = signed_url[:100]
    logger.debug(f"Signed URL: {url_preview}...")
272
+
273
+
274
+ # TODO: temp
275
def _validate_session_uri(session_uri: str | None) -> str:
    """Return the session URI unchanged, or raise when it is missing or empty.

    Raises:
        DataStorageError: If ``session_uri`` is None or an empty string.
    """
    if session_uri:
        uri_preview = session_uri[:100]
        logger.debug(f"Resumable session initiated. Session URI: {uri_preview}...")
        return session_uri

    raise DataStorageError("No session URI returned from resumable upload initiation")
283
+
284
+
285
async def _upload_chunk_with_retry(
    session: aiohttp.ClientSession,
    session_uri: str,
    chunk_data: bytes,
    range_start: int,
    file_size: int,
    progress_bar: tqdm,
) -> int:
    """Upload a single chunk with retry logic.

    The chunk is PUT to the session URI with a ``Content-Range`` header. A
    308 response (chunk accepted, more expected) or a 200/201 response
    (upload finished) counts as success. Any other status, a timeout, or an
    aiohttp client error is retried up to MAX_RETRIES times with exponential
    backoff between attempts.

    Args:
        session: Open aiohttp session used for the PUT
        session_uri: Resumable session URI to upload to
        chunk_data: Bytes of this chunk
        range_start: Offset of the first byte of the chunk within the file
        file_size: Total size of the file being uploaded
        progress_bar: Progress bar advanced by the chunk length on success

    Returns:
        Number of bytes uploaded, i.e. ``len(chunk_data)``

    Raises:
        aiohttp.ClientError, TimeoutError: If the last attempt fails with one.
        DataStorageError: If every attempt ends with an unexpected status
            that ``raise_for_status()`` does not turn into an exception.
    """
    chunk_len = len(chunk_data)
    range_end = range_start + chunk_len - 1
    chunk_headers = {
        "Content-Type": "application/octet-stream",
        "Content-Length": str(chunk_len),
        "Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
    }

    for attempt in range(MAX_RETRIES):
        is_last_attempt = attempt == MAX_RETRIES - 1
        try:
            async with session.put(
                session_uri, data=chunk_data, headers=chunk_headers
            ) as chunk_response:
                if chunk_response.status == HTTP_RESUME_INCOMPLETE:
                    progress_bar.update(chunk_len)
                    logger.debug(f"Uploaded chunk: {range_end + 1}/{file_size} bytes")
                    return chunk_len
                if chunk_response.status in {200, 201}:
                    progress_bar.update(chunk_len)
                    logger.debug(
                        f"Upload completed successfully. Final response: {chunk_response.status}"
                    )
                    return chunk_len

                error_text = await chunk_response.text()
                logger.warning(
                    f"Chunk upload failed (attempt {attempt + 1}/{MAX_RETRIES}): {chunk_response.status}"
                )
                logger.warning(f"Response: {error_text}")
                if is_last_attempt:
                    chunk_response.raise_for_status()

        except (TimeoutError, aiohttp.ClientError) as e:
            logger.warning(
                f"Chunk upload error (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
            )
            if is_last_attempt:
                raise

        # Back off after any failed attempt, whether it was an exception or
        # an unexpected HTTP status, before trying again.
        if not is_last_attempt:
            await asyncio.sleep(2**attempt)

    # Reaching this point means no attempt succeeded and nothing was raised.
    # Returning 0 would let the caller continue from the wrong offset.
    raise DataStorageError(
        f"Chunk upload failed after {MAX_RETRIES} attempts "
        f"(bytes {range_start}-{range_end}/{file_size})"
    )
334
+
335
+
336
async def _aupload_file_with_progress(
    signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
) -> None:
    """Upload a file asynchronously using aiohttp with signed URL initiation.

    A resumable session is opened with the signed URL, then the file is read
    and sent in chunks of at most CHUNK_SIZE bytes until ``file_size`` bytes
    have been uploaded.

    Args:
        signed_url: Signed URL used to initiate the resumable session
        file_path: Local file to upload
        progress_bar: Progress bar advanced as chunks are accepted
        file_size: Number of bytes to upload

    Raises:
        DataStorageError: If the file ends before ``file_size`` bytes were
            read, or a chunk upload reports fewer bytes than were sent.
        Exception: Any error from session initiation or chunk upload is
            logged and re-raised.
    """
    _setup_upload_progress(file_path, file_size, progress_bar)
    _log_upload_debug(signed_url)

    try:
        # Overall timeout grows with file size (one second per 512 KiB),
        # with a floor of 600 seconds.
        timeout = aiohttp.ClientTimeout(
            total=max(600.0, file_size / (512 * 1024)), connect=30, sock_read=30
        )
        connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)

        async with aiohttp.ClientSession(
            connector=connector, timeout=timeout
        ) as session:
            session_uri = await _initiate_resumable_session(session, signed_url)

            async with aiofiles.open(file_path, "rb") as file_obj:
                bytes_uploaded = 0

                while bytes_uploaded < file_size:
                    remaining = file_size - bytes_uploaded
                    chunk_data = await file_obj.read(min(CHUNK_SIZE, remaining))

                    if not chunk_data:
                        # The file is shorter than announced, so the upload
                        # cannot be completed; do not report success.
                        raise DataStorageError(
                            f"File {file_path} ended after {bytes_uploaded:,} of "
                            f"{file_size:,} bytes"
                        )

                    uploaded_bytes = await _upload_chunk_with_retry(
                        session,
                        session_uri,
                        chunk_data,
                        bytes_uploaded,
                        file_size,
                        progress_bar,
                    )
                    if uploaded_bytes != len(chunk_data):
                        # The read position has already advanced; carrying on
                        # would send later chunks with a wrong Content-Range.
                        raise DataStorageError(
                            f"Chunk at offset {bytes_uploaded:,} was not uploaded"
                        )
                    bytes_uploaded += uploaded_bytes

        logger.debug("Upload completed successfully")

    except Exception as e:
        logger.error(f"Async resumable upload error: {type(e).__name__}: {e}")
        raise
383
+
384
+
385
def _upload_file_with_progress(
    signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
) -> None:
    """Upload a file synchronously using google.resumable_media with signed URL initiation.

    The resumable session is opened manually with a POST to the signed URL.
    A ``ResumableUpload`` object is then pointed at the returned session URI
    and drives the chunked transfer.

    Args:
        signed_url: Signed URL used to initiate the resumable session
        file_path: Local file to upload
        progress_bar: Progress bar advanced as the file is read
        file_size: Number of bytes to upload

    Raises:
        Exception: Any error during initiation or transfer is logged and
            re-raised.
    """
    _setup_upload_progress(file_path, file_size, progress_bar)
    _log_upload_debug(signed_url)

    try:
        # The context manager closes the session when the upload ends,
        # successfully or not.
        with requests_lib.Session() as session:
            retry_strategy = Retry(
                total=MAX_RETRIES,
                backoff_factor=2,
                status_forcelist=[429, 500, 502, 503, 504],
                allowed_methods=["POST", "PUT", "PATCH"],
            )
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            logger.debug("Initiating resumable upload session")
            initiate_response = session.post(
                signed_url, headers=INITIATE_HEADERS, timeout=30
            )

            if initiate_response.status_code not in {200, 201}:
                logger.error(
                    f"Failed to initiate resumable session: {initiate_response.status_code}"
                )
                logger.error(f"Response: {initiate_response.text}")
                initiate_response.raise_for_status()

            session_uri = _validate_session_uri(
                initiate_response.headers.get("location")
            )

            with open(file_path, "rb") as file_obj:
                upload = resumable_requests.ResumableUpload(
                    upload_url=signed_url, chunk_size=CHUNK_SIZE
                )

                # NOTE(review): these are private attributes of
                # ResumableUpload, set by hand because the session was
                # initiated outside the library - re-check on library upgrades.
                upload._resumable_url = session_uri
                upload._total_bytes = file_size
                # Reads go through the wrapper so the progress bar advances.
                upload._stream = ProgressWrapper(file_obj, progress_bar)

                while not upload.finished:
                    try:
                        upload.transmit_next_chunk(session)
                    except Exception as e:
                        logger.error(f"Chunk upload failed: {e}")
                        raise

        logger.debug("Upload completed successfully using resumable_media library")

    except Exception as e:
        logger.error(f"Sync resumable upload error: {type(e).__name__}: {e}")
        raise
442
+
443
+
444
class RestClientError(Exception):
    """Root of the REST client exception hierarchy."""


class DataStorageError(RestClientError):
    """Root exception for failures in data storage operations."""


class DataStorageCreationError(DataStorageError):
    """Signals that a data storage entry could not be created."""


class DataStorageRetrievalError(DataStorageError):
    """Signals that a data storage entry could not be retrieved."""
458
+
459
+
460
class ProgressWrapper:
    """File-like adapter that advances a progress bar as the wrapped file is read.

    ``read`` moves the bar forward to the file's current position and never
    moves it backwards. ``seek`` and ``tell`` are forwarded to the wrapped
    file unchanged.
    """

    def __init__(self, file_obj, progress_bar):
        self.file_obj = file_obj
        self.progress_bar = progress_bar
        # Total number of bytes returned by read(), counting re-reads.
        self.bytes_read = 0

    def read(self, size=-1):
        payload = self.file_obj.read(size)
        if not payload:
            return payload

        self.bytes_read += len(payload)
        # Compare the position with the bar so re-reading after a seek back
        # does not count the same bytes twice.
        advance = self.file_obj.tell() - self.progress_bar.n
        if advance > 0:
            self.progress_bar.update(advance)
        return payload

    def seek(self, offset, whence=0):
        return self.file_obj.seek(offset, whence)

    def tell(self):
        return self.file_obj.tell()
482
+
483
+
484
+ class DataStorageMethods:
485
+ """Data storage methods for RestClient.
486
+
487
+ This class contains methods for interacting with the data storage API endpoints.
488
+ """
489
+
490
+ # needed for mypy `NoReturn`
491
def _handle_http_errors(self, e: HTTPStatusError, operation: str) -> NoReturn:
    """Translate an HTTP status error into a DataStorageError and raise it.

    Args:
        e: The HTTP status error to translate
        operation: Verb describing the failed operation, used in the message

    Raises:
        DataStorageError: Always, chained from ``e``. The message depends on
            whether the status is 403, 422, or anything else.
    """
    status = e.response.status_code
    if status == codes.FORBIDDEN:
        message = f"Error {operation} data storage entry, not authorized"
    elif status == codes.UNPROCESSABLE_ENTITY:
        message = f"Invalid request payload: {e.response.text}"
    else:
        message = (
            f"Error {operation} data storage entry: {status} - {e.response.text}"
        )
    raise DataStorageError(message) from e
502
+
503
+ def _validate_file_path(self, file_path: str | Path) -> Path:
504
+ """Validate file path exists and return Path object."""
505
+ file_path = Path(file_path)
506
+ if not file_path.exists():
507
+ raise DataStorageError(f"File or directory not found: {file_path}")
508
+ return file_path
509
+
510
+ def _build_zip_path(
511
+ self, name: str, path_override: str | Path | None
512
+ ) -> str | Path:
513
+ """Build GCS path for zip file."""
514
+ zip_filename = name if name.endswith(".zip") else f"{name}.zip"
515
+ if path_override:
516
+ if isinstance(path_override, str):
517
+ return f"{path_override.rstrip('/')}/{zip_filename}"
518
+ return path_override / zip_filename
519
+ return zip_filename
520
+
521
+ # TODO: methods in here need to be moved to fh tools
522
+ # =====================================
523
def _is_zip_file(self, file_path: Path) -> bool:
    """Report whether a file should be treated as a zip archive.

    Files with an office-document extension are never treated as archives,
    even though they carry the same ``PK`` magic bytes. For any other file
    the first two bytes are compared against ``PK``; a file that cannot be
    read yields False.
    """
    non_archive_suffixes = frozenset(
        (
            # Excel formats
            ".xlsx", ".xlsm", ".xlsb",
            # Word formats
            ".docx", ".docm",
            # PowerPoint formats
            ".pptx", ".pptm",
            # OpenDocument formats
            ".odt", ".ods", ".odp",
            # Apple iWork formats
            ".pages", ".numbers", ".key",
        )
    )
    if file_path.suffix.lower() in non_archive_suffixes:
        return False

    try:
        with open(file_path, "rb") as handle:
            return handle.read(2) == b"PK"
    except Exception:
        return False
553
+
554
def _extract_zip_file(self, zip_path: Path, extract_to: Path) -> Path:
    """Extract a zip file and return the path to the extracted content.

    The archive is deleted after extraction. If extraction into
    ``extract_to`` fails with FileExistsError, it is retried into an
    ``extracted`` sub-directory.

    Args:
        zip_path: Path to the zip file
        extract_to: Directory to extract to

    Returns:
        The single extracted item when the extraction directory holds
        exactly one entry, otherwise the extraction directory itself
    """
    extract_dir = extract_to
    extract_dir.mkdir(exist_ok=True)

    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
    except FileExistsError:
        logger.warning(f"File {zip_path} already exists in {extract_dir}")
        extract_dir = extract_dir / "extracted"
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)

    # Delete the zip file BEFORE listing the directory. The archive often
    # lives inside the extraction directory, and listing first would count
    # it as extracted content (or even return its now-deleted path).
    zip_path.unlink()

    extracted_items = list(extract_dir.iterdir())
    if len(extracted_items) == 1:
        return extracted_items[0]
    return extract_dir
584
+
585
async def _adownload_from_gcs(
    self, signed_url: str, file_name: str | None = None
) -> Path:
    """Download file from GCS using signed URL and handle unzipping if needed.

    The file is streamed into a temporary directory. A ``filename=`` value
    in the response's ``content-disposition`` header overrides
    ``file_name``. The result (or the extracted content, when the download
    is a zip) is copied into a fresh ``tempfile.mkdtemp()`` directory that
    is not cleaned up by this method.

    Args:
        signed_url: The signed URL to download from
        file_name: The name of the file to download

    Returns:
        Path to the downloaded file (or unzipped directory if it was a zip)

    Raises:
        DataStorageError: If any step of the download fails.
    """
    unusable_names = {"", ".", ".."}

    # Keep only the final path component so that neither the caller-supplied
    # name nor the server-supplied header can point outside the temp dir.
    file_name = Path(file_name or "downloaded_file").name
    if file_name in unusable_names:
        file_name = "downloaded_file"

    try:
        with tempfile.TemporaryDirectory() as temp_dir_str:
            temp_dir = Path(temp_dir_str)
            temp_file = temp_dir / file_name

            async with self.async_client.stream("GET", signed_url) as response:
                response.raise_for_status()

                content_disposition = response.headers.get(
                    "content-disposition", ""
                )
                if "filename=" in content_disposition:
                    # Drop trailing header parameters and quotes, then any
                    # directory part.
                    header_value = content_disposition.split("filename=")[-1]
                    header_value = header_value.split(";", 1)[0].strip().strip('"')
                    header_name = Path(header_value).name
                    if header_name not in unusable_names:
                        temp_file = temp_dir / header_name

                async with aiofiles.open(temp_file, "wb") as f:
                    async for chunk in response.aiter_bytes(chunk_size=8192):
                        await f.write(chunk)

            logger.debug(
                f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
            )

            # The TemporaryDirectory is removed on exit, so the result is
            # copied to a directory that outlives this call.
            final_temp_dir = Path(tempfile.mkdtemp())

            if self._is_zip_file(temp_file):
                logger.debug(f"File {temp_file} is a zip file, extracting...")
                extracted_path = self._extract_zip_file(temp_file, temp_dir)
                final_path = final_temp_dir / extracted_path.name

                if extracted_path.is_dir():
                    shutil.copytree(extracted_path, final_path)
                else:
                    shutil.copy2(extracted_path, final_path)

                return final_path

            final_file = final_temp_dir / temp_file.name
            shutil.copy2(temp_file, final_file)
            return final_file

    except Exception as e:
        raise DataStorageError(f"Failed to download from GCS: {e}") from e
645
+
646
def _download_from_gcs(self, signed_url: str, file_name: str | None = None) -> Path:
    """Download file from GCS using signed URL and handle unzipping if needed (sync version).

    The file is streamed into a temporary directory. A ``filename=`` value
    in the response's ``content-disposition`` header overrides
    ``file_name``. The result (or the extracted content, when the download
    is a zip) is copied into a fresh ``tempfile.mkdtemp()`` directory that
    is not cleaned up by this method.

    Args:
        signed_url: The signed URL to download from
        file_name: The name of the file to download

    Returns:
        Path to the downloaded file (or unzipped directory if it was a zip)

    Raises:
        DataStorageError: If any step of the download fails.
    """
    unusable_names = {"", ".", ".."}

    # Keep only the final path component so that neither the caller-supplied
    # name nor the server-supplied header can point outside the temp dir.
    file_name = Path(file_name or "downloaded_file").name
    if file_name in unusable_names:
        file_name = "downloaded_file"

    try:
        with tempfile.TemporaryDirectory() as temp_dir_str:
            temp_dir = Path(temp_dir_str)
            temp_file = temp_dir / file_name

            with requests_lib.get(signed_url, stream=True, timeout=30) as response:
                response.raise_for_status()

                content_disposition = response.headers.get(
                    "content-disposition", ""
                )
                if "filename=" in content_disposition:
                    # Drop trailing header parameters and quotes, then any
                    # directory part.
                    header_value = content_disposition.split("filename=")[-1]
                    header_value = header_value.split(";", 1)[0].strip().strip('"')
                    header_name = Path(header_value).name
                    if header_name not in unusable_names:
                        temp_file = temp_dir / header_name

                with open(temp_file, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            logger.debug(
                f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
            )

            # The TemporaryDirectory is removed on exit, so the result is
            # copied to a directory that outlives this call.
            final_temp_dir = Path(tempfile.mkdtemp())

            if self._is_zip_file(temp_file):
                logger.debug(f"File {temp_file} is a zip file, extracting...")
                extracted_path = self._extract_zip_file(temp_file, temp_dir)
                final_path = final_temp_dir / extracted_path.name

                if extracted_path.is_dir():
                    shutil.copytree(extracted_path, final_path)
                else:
                    shutil.copy2(extracted_path, final_path)

                return final_path

            final_file = final_temp_dir / temp_file.name
            shutil.copy2(temp_file, final_file)
            return final_file

    except Exception as e:
        raise DataStorageError(f"Failed to download from GCS: {e}") from e
703
+
704
def _prepare_single_file_upload(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | Path | None,
    dataset_id: UUID | None,
    project_id: UUID | None,
    metadata: dict[str, Any] | None,
    tags: list[str] | None,
    parent_id: UUID | None,
) -> tuple[int, DataStorageRequestPayload | None]:
    """Measure a file and, when it qualifies, build an inline-text payload.

    Returns:
        ``(file_size, payload)`` when the file is a small supported text
        file whose content could be read, otherwise ``(file_size, None)``
        so the caller can upload it as a file instead.
    """
    file_size = file_path.stat().st_size
    logger.debug(
        f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
    )

    if not _should_send_as_text_content(file_path, file_size):
        return file_size, None

    logger.debug(f"Small text file ({file_size:,} bytes) - sending as text content")
    text_content = _extract_text_from_file(file_path)
    if text_content is None:
        logger.warning("Could not extract text content, falling back to file upload")
        return file_size, None

    payload = DataStorageRequestPayload(
        name=name,
        description=description,
        content=text_content,
        file_path=file_path_override or file_path,
        is_collection=False,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        dataset_id=dataset_id,
        parent_id=parent_id,
    )
    return file_size, payload
745
+
746
def _create_data_storage_entry(
    self, payload: DataStorageRequestPayload
) -> DataStorageResponse:
    """POST the payload to the data-entries endpoint and parse the reply (sync version).

    The payload is serialised in JSON mode with ``None`` fields left out.
    A non-success status raises via ``raise_for_status()``.
    """
    request_body = payload.model_dump(mode="json", exclude_none=True)
    response = self.client.post("/v0.1/data-storage/data-entries", json=request_body)
    response.raise_for_status()
    parsed = DataStorageResponse.model_validate(response.json())
    return parsed
756
+
757
async def _acreate_data_storage_entry(
    self, payload: DataStorageRequestPayload
) -> DataStorageResponse:
    """POST the payload to the data-entries endpoint and parse the reply (async version).

    The payload is serialised in JSON mode with ``None`` fields left out.
    A non-success status raises via ``raise_for_status()``.
    """
    request_body = payload.model_dump(mode="json", exclude_none=True)
    response = await self.async_client.post(
        "/v0.1/data-storage/data-entries", json=request_body
    )
    response.raise_for_status()
    parsed = DataStorageResponse.model_validate(response.json())
    return parsed
767
+
768
+ def _generate_folder_description_from_files(
769
+ self, dir_path: Path, manifest: DirectoryManifest
770
+ ) -> str:
771
+ """Generate folder description by concatenating descriptions of top-level files."""
772
+ descriptions = []
773
+
774
+ # Get top-level files only (not recursive)
775
+ for item in dir_path.iterdir():
776
+ if item.is_file():
777
+ # Try to get description from manifest first
778
+ file_desc = manifest.get_entry_description(item.name)
779
+
780
+ if file_desc:
781
+ descriptions.append(f"{item.name}: {file_desc}")
782
+ else:
783
+ descriptions.append(item.name)
784
+
785
+ if descriptions:
786
+ return f"Directory containing: {', '.join(descriptions)}"
787
+ return f"Directory: {dir_path.name}"
788
+
789
def _load_manifest(
    self, dir_path: Path, manifest_filename: str | None
) -> DirectoryManifest:
    """Read a JSON or YAML manifest from a directory into a DirectoryManifest.

    Returns an empty manifest when no filename is given, when the extension
    is unsupported, or when reading or parsing fails (a warning is logged,
    including when pyyaml is missing for a YAML manifest).

    Raises:
        DataStorageCreationError: If a filename is given but the file does
            not exist in ``dir_path``.
    """
    if not manifest_filename:
        return DirectoryManifest()

    manifest_path = dir_path / manifest_filename
    if not manifest_path.exists():
        logger.error(f"Manifest file not found at {manifest_path}")
        raise DataStorageCreationError(
            f"Manifest file {manifest_filename} not found in directory {dir_path}. Ensure the manifest exists and is correctly named, or do not pass it as an argument."
        )

    try:
        with open(manifest_path, encoding="utf-8") as handle:
            lowered_name = manifest_filename.lower()
            if lowered_name.endswith(".json"):
                parsed = json.load(handle)
            elif lowered_name.endswith((".yaml", ".yml")):
                if yaml is None:
                    raise ImportError(
                        "pyyaml is required to parse .yaml manifest files. "
                        "Please install it with `pip install pyyaml`."
                    )
                parsed = yaml.safe_load(handle)
            else:
                logger.warning(
                    f"Unsupported manifest file extension: {manifest_filename}"
                )
                return DirectoryManifest()

        # An empty document parses to None/empty; treat it as no entries.
        return DirectoryManifest.from_dict(parsed or {})

    except Exception as e:
        # Best effort: a broken manifest degrades to an empty one.
        logger.warning(f"Failed to load manifest {manifest_filename}: {e}")

    return DirectoryManifest()
827
+
828
def _upload_data_directory(
    self,
    name: str,
    dir_path: Path,
    description: str | None,
    dir_path_override: str | Path | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Upload a directory as a single zip file collection.

    The directory is zipped into a temporary location and a data storage
    entry is created for it. The zip is then uploaded to the signed URL of
    every returned storage location, and finally the entry is marked
    ``active``.

    Args:
        name: Name for the directory collection
        dir_path: Path to directory to zip and upload
        description: Description for the collection
        dir_path_override: Optional GCS path for the zip file
        ignore_patterns: List of patterns to ignore when zipping
        ignore_filename: Name of ignore file to read from directory
        project_id: ID of the project this data storage entry belongs to
        tags: List of tags to associate with the data storage entry
        metadata: Optional metadata for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse for the uploaded zip file

    Raises:
        DataStorageCreationError: If a storage location has no signed URL.
    """
    logger.debug(f"Uploading directory as zip: {dir_path}")

    # Use a temporary directory rather than an open NamedTemporaryFile: the
    # zip is written and later re-read by path, and a still-open named
    # temporary file cannot be reopened by name on every platform.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_zip_path = Path(temp_dir) / "upload.zip"

        zip_size = _create_directory_zip(
            dir_path, temp_zip_path, ignore_patterns, ignore_filename
        )

        zip_gcs_path = self._build_zip_path(name, dir_path_override)
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            file_path=zip_gcs_path,
            is_collection=True,
            project_id=project_id,
            tags=tags,
            metadata=metadata,
            dataset_id=dataset_id,
            parent_id=parent_id,
        )

        logger.debug(
            f"Creating data storage entry for zip: {payload.model_dump(exclude_none=True)}"
        )
        data_storage_response = self._create_data_storage_entry(payload)

        for storage_location in data_storage_response.storage_locations:
            if not storage_location.storage_config.signed_url:
                raise DataStorageCreationError(
                    "No signed URL returned for zip upload"
                )

            with tqdm(
                total=zip_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=f"Uploading {dir_path.name} (zipped)",
                miniters=1,
                mininterval=0.1,
            ) as pbar:
                _upload_file_with_progress(
                    storage_location.storage_config.signed_url,
                    temp_zip_path,
                    pbar,
                    zip_size,
                )

        # Mark the entry as active once the upload loop has finished.
        status_response = self.client.patch(
            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
            json={"status": "active"},
        )
        status_response.raise_for_status()

        logger.debug(
            f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
        )
        return DataStorageResponse.model_validate(status_response.json())
919
+
920
async def _aupload_data_directory(
    self,
    name: str,
    dir_path: Path,
    description: str | None,
    dir_path_override: str | Path | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously upload a directory as a single zip file.

    The directory is zipped into a temporary file, a collection entry is
    created for it, the zip is uploaded to the signed URL of every returned
    storage location, and the entry is finally patched to ``active``.

    Args:
        name: Name for the directory collection
        dir_path: Path to directory to zip and upload
        description: Description for the collection
        dir_path_override: Optional GCS path for the zip file
        ignore_patterns: List of patterns to ignore when zipping
        ignore_filename: Name of ignore file to read from directory
        project_id: ID of the project this data storage entry belongs to
        tags: List of tags to associate with the data storage entry
        metadata: Optional metadata for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse parsed from the status-update response.

    Raises:
        DataStorageCreationError: If a storage location has no signed URL.
    """
    logger.debug(f"Async uploading directory as zip: {dir_path}")

    with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
        temp_zip_path = Path(temp_file.name)

        # Building the archive is blocking work; run it in a worker thread so
        # the event loop stays free for other coroutines while it runs.
        zip_size = await asyncio.to_thread(
            _create_directory_zip,
            dir_path,
            temp_zip_path,
            ignore_patterns,
            ignore_filename,
        )

        zip_gcs_path = self._build_zip_path(name, dir_path_override)
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            file_path=zip_gcs_path,
            is_collection=True,
            project_id=project_id,
            tags=tags,
            metadata=metadata,
            dataset_id=dataset_id,
            parent_id=parent_id,
        )

        data_storage_response = await self._acreate_data_storage_entry(payload)

        for storage_location in data_storage_response.storage_locations:
            if not storage_location.storage_config.signed_url:
                raise DataStorageCreationError(
                    "No signed URL returned for zip upload"
                )

            with tqdm(
                total=zip_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=f"Uploading {dir_path.name} (zipped)",
                miniters=1,
                mininterval=0.1,
            ) as pbar:
                await _aupload_file_with_progress(
                    storage_location.storage_config.signed_url,
                    temp_zip_path,
                    pbar,
                    zip_size,
                )

        # The entry is only switched to "active" after the uploads above.
        status_response = await self.async_client.patch(
            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
            json={"status": "active"},
        )
        status_response.raise_for_status()

        logger.debug(
            f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
        )
        return DataStorageResponse.model_validate(status_response.json())
1008
+
1009
def _upload_data_single_file(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | Path | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Upload one local file, inline as text when possible, else via signed URL.

    When ``_should_send_as_text_content`` accepts the file and its text can be
    extracted, the text is sent in the creation request itself. Otherwise an
    entry is created, the file is uploaded to every returned storage location
    and the entry is patched to ``active``.

    Args:
        name: Name of the data storage entry
        file_path: Local file to upload
        description: Optional description of the entry
        file_path_override: Path recorded for the entry instead of ``file_path``
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry
        dataset_id: Optional dataset ID to add the entry to
        parent_id: Optional parent ID for the entry

    Returns:
        The creation response for inline text, otherwise the response parsed
        from the status update.

    Raises:
        DataStorageCreationError: If a storage location has no signed URL.
    """
    file_size = file_path.stat().st_size
    logger.debug(
        f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
    )

    # Fields common to the inline-text request and the signed-URL request.
    shared_fields: dict[str, Any] = {
        "name": name,
        "description": description,
        "file_path": file_path_override or file_path,
        "is_collection": False,
        "project_id": project_id,
        "metadata": metadata,
        "tags": tags,
        "dataset_id": dataset_id,
        "parent_id": parent_id,
    }

    if _should_send_as_text_content(file_path, file_size):
        logger.debug(
            f"Small text file ({file_size:,} bytes) - sending as text content"
        )

        inline_text = _extract_text_from_file(file_path)
        if inline_text is None:
            logger.warning(
                "Could not extract text content, falling back to file upload"
            )
        else:
            text_request = DataStorageRequestPayload(
                content=inline_text, **shared_fields
            )
            logger.debug("Sending file as text content")
            return self._create_data_storage_entry(text_request)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    upload_request = DataStorageRequestPayload(**shared_fields)

    logger.debug(
        f"Requesting signed URL with payload: {upload_request.model_dump(exclude_none=True)}"
    )

    created = self._create_data_storage_entry(upload_request)

    for target in created.storage_locations:
        if not target.storage_config.signed_url:
            raise DataStorageCreationError("No signed URL returned from server")

        with tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Uploading {file_path.name}",
            miniters=1,
            mininterval=0.1,
        ) as pbar:
            try:
                _upload_file_with_progress(
                    target.storage_config.signed_url,
                    file_path,
                    pbar,
                    file_size,
                )
            except Exception as e:
                # Log for context, then let the caller see the real failure.
                logger.error(f"Failed to upload file to signed URL: {e}")
                raise
            else:
                logger.debug("File upload to signed URL completed successfully")

    logger.debug("Updating data storage status to active")
    status_response = self.client.patch(
        f"/v0.1/data-storage/data-entries/{created.data_storage.id}",
        json={"status": "active"},
    )
    status_response.raise_for_status()
    logger.debug("Data storage status updated successfully")

    return DataStorageResponse.model_validate(status_response.json())
1108
+
1109
async def _aupload_data_single_file(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | Path | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously upload one file, as inline text or through a signed URL.

    ``_prepare_single_file_upload`` supplies the file size and, when the file
    can be sent as text, a ready payload that is posted directly. Otherwise an
    entry is created, the file is uploaded to every returned storage location
    and the entry is patched to ``active``.

    Args:
        name: Name of the data storage entry
        file_path: Local file to upload
        description: Optional description of the entry
        file_path_override: Path recorded for the entry instead of ``file_path``
        dataset_id: Optional dataset ID to add the entry to
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry
        parent_id: Optional parent ID for the entry

    Returns:
        The creation response for inline text, otherwise the response parsed
        from the status update.

    Raises:
        DataStorageCreationError: If a storage location has no signed URL.
    """
    file_size, inline_request = self._prepare_single_file_upload(
        name=name,
        file_path=file_path,
        description=description,
        file_path_override=file_path_override,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        parent_id=parent_id,
    )

    if inline_request:
        logger.debug("Sending file as text content")
        inline_request.dataset_id = dataset_id
        return await self._acreate_data_storage_entry(inline_request)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    upload_request = DataStorageRequestPayload(
        name=name,
        description=description,
        file_path=file_path_override or file_path,
        is_collection=False,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        parent_id=parent_id,
    )
    created = await self._acreate_data_storage_entry(upload_request)

    for target in created.storage_locations:
        if not target.storage_config.signed_url:
            raise DataStorageCreationError(
                f"No signed URL returned from server for location: {target.id}"
            )

        with tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Uploading {file_path.name}",
            miniters=1,
            mininterval=0.1,
            leave=False,
        ) as pbar:
            await _aupload_file_with_progress(
                target.storage_config.signed_url, file_path, pbar, file_size
            )

    # Mark the entry active once the uploads above have finished.
    entry_url = f"/v0.1/data-storage/data-entries/{created.data_storage.id}"
    status_response = await self.async_client.patch(
        entry_url,
        json={"status": "active"},
    )
    status_response.raise_for_status()

    return DataStorageResponse.model_validate(status_response.json())
1183
+
1184
def _upload_data_single_file_with_parent(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | None,
    parent_id: UUID | None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse:
    """Upload a single file beneath a parent entry (sync version).

    ``_prepare_single_file_upload`` supplies the file size and, when the file
    can be sent as text, a ready payload that is posted directly. Otherwise an
    entry is created, the file is uploaded to every returned storage location
    and the entry is patched to ``active``.

    Args:
        name: Name of the data storage entry
        file_path: Local file to upload
        description: Optional description of the entry
        file_path_override: Path recorded for the entry instead of ``file_path``
        parent_id: ID of the parent entry
        dataset_id: Optional dataset ID to add the entry to
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry

    Returns:
        The creation response for inline text, otherwise the response parsed
        from the status update.

    Raises:
        DataStorageCreationError: If a storage location has no signed URL.
    """
    file_size, inline_request = self._prepare_single_file_upload(
        name=name,
        file_path=file_path,
        description=description,
        file_path_override=file_path_override,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        parent_id=parent_id,
    )

    if inline_request:
        logger.debug("Sending file as text content with parent_id")
        # Re-assert the hierarchy fields on the prepared payload before sending.
        inline_request.parent_id = parent_id
        inline_request.dataset_id = dataset_id
        inline_request.project_id = project_id
        return self._create_data_storage_entry(inline_request)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    upload_request = DataStorageRequestPayload(
        name=name,
        description=description,
        file_path=file_path_override or file_path,
        is_collection=False,
        parent_id=parent_id,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
    )
    created = self._create_data_storage_entry(upload_request)

    for target in created.storage_locations:
        if not target.storage_config.signed_url:
            raise DataStorageCreationError("No signed URL returned from server")

        with tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Uploading {file_path.name}",
            miniters=1,
            mininterval=0.1,
            leave=False,
        ) as pbar:
            _upload_file_with_progress(
                target.storage_config.signed_url, file_path, pbar, file_size
            )

    # Mark the entry active once the uploads above have finished.
    entry_url = f"/v0.1/data-storage/data-entries/{created.data_storage.id}"
    status_response = self.client.patch(
        entry_url,
        json={"status": "active"},
    )
    status_response.raise_for_status()

    return DataStorageResponse.model_validate(status_response.json())
1257
+
1258
def _process_file_item(
    self,
    item: Path,
    dir_manifest: DirectoryManifest,
    current_parent_id: UUID,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse | None:
    """Upload one file found during a hierarchical directory upload.

    The description comes from the manifest entry for the file name, falling
    back to ``"File: <name>"``. Upload failures are logged and swallowed so
    the caller can carry on with the remaining items.

    Args:
        item: File to upload
        dir_manifest: Manifest consulted for the file's description
        current_parent_id: ID of the entry the file is attached to
        dataset_id: Optional dataset ID to add the entry to
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry

    Returns:
        The upload response, or None when the upload raised.
    """
    try:
        filename = item.name
        file_description = (
            dir_manifest.get_entry_description(filename) or f"File: {filename}"
        )

        logger.debug(
            f"Processing file {filename} with description: '{file_description}'"
        )

        return self._upload_data_single_file_with_parent(
            name=filename,
            file_path=item,
            description=file_description,
            file_path_override=None,
            parent_id=current_parent_id,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
        )
    except Exception as e:
        # Best effort: one failed file must not abort the whole directory.
        logger.error(f"Failed to upload file {item}: {e}")
        return None
1291
+
1292
def _upload_directory_hierarchically(
    self,
    name: str,
    dir_path: Path,
    description: str | None = None,
    manifest_filename: str | None = None,
    parent_id: UUID | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    base_dir: Path | None = None,
    dir_manifest: DirectoryManifest | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> list[DataStorageResponse]:
    """Upload a directory tree as one entry per directory and per file.

    A call with ``parent_id=None`` is the root call: it creates the top-level
    entry, collects ignore patterns, loads the manifest, and takes the dataset
    ID from the created entry (the ``base_dir``, ``dir_manifest`` and
    ``dataset_id`` arguments are not used at that level). Recursive calls pass
    ``parent_id`` and reuse the patterns, manifest and dataset given to them.

    Args:
        name: Name for the root entry
        dir_path: Directory to walk
        description: Description for the root entry
        manifest_filename: Manifest file to load at the root level
        parent_id: Entry to attach children to; None marks the root call
        ignore_patterns: Patterns to skip
        ignore_filename: Name of the ignore file read at the root level
        base_dir: Root directory used when matching ignore patterns
        dir_manifest: Manifest describing this directory's children
        dataset_id: Dataset the child entries are added to
        project_id: ID of the project the entries belong to
        metadata: Optional metadata applied to created entries
        tags: Optional tags applied to created entries

    Returns:
        Responses for every entry created, in creation order.
    """
    uploaded = []
    if parent_id is None:
        base_dir = dir_path
        all_ignore_patterns = _collect_ignore_patterns(
            base_dir, ignore_patterns, ignore_filename
        )

        root_response = self._create_data_storage_entry(
            DataStorageRequestPayload(
                name=name,
                description=description,
                parent_id=None,
                dataset_id=None,
                is_collection=False,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
        )
        uploaded.append(root_response)
        current_parent_id = root_response.data_storage.id
        current_dataset_id = root_response.data_storage.dataset_id

        dir_manifest = self._load_directory_manifest(
            manifest_filename, parent_id, dir_path
        )
    else:
        all_ignore_patterns = ignore_patterns or []
        current_parent_id = parent_id
        current_dataset_id = dataset_id

    for child in dir_path.iterdir():
        if base_dir and _should_ignore_file(child, base_dir, all_ignore_patterns):
            continue

        if child.is_dir():
            # Resolve the manifest describing this subdirectory, if any.
            child_manifest = DirectoryManifest()
            manifest_entry = (
                dir_manifest.entries.get(child.name) if dir_manifest else None
            )
            if isinstance(manifest_entry, DirectoryManifest):
                child_manifest = manifest_entry
            elif isinstance(manifest_entry, ManifestEntry):
                # Convert single entry to manifest
                child_manifest = DirectoryManifest(
                    entries={child.name: manifest_entry}
                )

            child_description = child_manifest.get_entry_description(
                child.name
            ) or self._generate_folder_description_from_files(child, child_manifest)

            child_response = self._create_data_storage_entry(
                DataStorageRequestPayload(
                    name=child.name,
                    description=child_description,
                    parent_id=current_parent_id,
                    dataset_id=current_dataset_id,
                    is_collection=False,
                    project_id=project_id,
                    metadata=metadata,
                    tags=tags,
                )
            )
            uploaded.append(child_response)

            uploaded.extend(
                self._upload_directory_hierarchically(
                    name=child.name,
                    dir_path=child,
                    description=None,
                    manifest_filename=None,
                    parent_id=child_response.data_storage.id,
                    ignore_patterns=all_ignore_patterns,
                    ignore_filename=ignore_filename,
                    base_dir=base_dir,
                    dir_manifest=child_manifest,
                    dataset_id=current_dataset_id,
                    project_id=project_id,
                    metadata=metadata,
                    tags=tags,
                )
            )
        elif child.is_file():
            file_response = self._process_file_item(
                item=child,
                dir_manifest=dir_manifest or DirectoryManifest(),
                current_parent_id=current_parent_id,
                dataset_id=current_dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
            # Failed uploads come back falsy and are left out of the result.
            if file_response:
                uploaded.append(file_response)

    return uploaded
1403
+
1404
def _load_directory_manifest(
    self,
    manifest_filename: str | None,
    parent_id: UUID | None,
    dir_path: Path,
) -> DirectoryManifest:
    """Load the manifest section that describes ``dir_path``, if any.

    A manifest is only read when a filename is given and there is no parent
    ID. It is loaded relative to the current working directory and the entry
    keyed by the directory's name is returned; a single ``ManifestEntry`` is
    wrapped in a one-entry manifest.

    Args:
        manifest_filename: Manifest file name, or None to skip loading
        parent_id: Parent entry ID; a truthy value skips loading
        dir_path: Directory whose name is looked up in the manifest

    Returns:
        The matching manifest, or an empty DirectoryManifest.
    """
    if not manifest_filename or parent_id:
        return DirectoryManifest()

    manifest_data = self._load_manifest(Path.cwd(), manifest_filename)
    dir_name = dir_path.name
    available_keys = list(manifest_data.entries.keys())
    logger.debug(f"Loaded manifest entries: {available_keys}")
    logger.debug(f"Looking for manifest entry with directory name: '{dir_name}'")

    entry = manifest_data.entries.get(dir_name)
    if isinstance(entry, DirectoryManifest):
        return entry
    if isinstance(entry, ManifestEntry):
        return DirectoryManifest(entries={dir_name: entry})

    logger.debug(
        f"No manifest entry found for '{dir_name}', available keys: {available_keys}"
    )
    return DirectoryManifest()
1431
+
1432
async def _aupload_data_single_file_with_parent(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | None,
    parent_id: UUID | None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse:
    """Asynchronously upload a single file with a parent ID.

    ``_prepare_single_file_upload`` supplies the file size and, when the file
    can be sent as text, a ready payload that is posted directly. Otherwise an
    entry is created, the file is uploaded to the first returned storage
    location and the entry is patched to ``active``.

    Args:
        name: Name of the data storage entry
        file_path: Local file to upload
        description: Optional description of the entry
        file_path_override: Path recorded for the entry instead of ``file_path``
        parent_id: ID of the parent entry
        dataset_id: Optional dataset ID to add the entry to
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry

    Returns:
        The creation response for inline text, otherwise the response parsed
        from the status update.

    Raises:
        DataStorageCreationError: If the server returns no storage location,
            or the first location has no signed URL.
    """
    file_size, text_payload = self._prepare_single_file_upload(
        name=name,
        file_path=file_path,
        description=description,
        file_path_override=file_path_override,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        parent_id=parent_id,
    )

    if text_payload:
        logger.debug("Sending file as text content with parent_id")
        text_payload.parent_id = parent_id
        text_payload.dataset_id = dataset_id
        text_payload.project_id = project_id
        return await self._acreate_data_storage_entry(text_payload)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    payload = DataStorageRequestPayload(
        name=name,
        description=description,
        file_path=file_path_override or file_path,
        is_collection=False,
        parent_id=parent_id,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
    )
    data_storage_response = await self._acreate_data_storage_entry(payload)

    storage_locations = data_storage_response.storage_locations
    if not storage_locations:
        # Fail with a domain error rather than a bare IndexError on [0].
        raise DataStorageCreationError("No storage locations returned from server")

    # NOTE(review): the sync counterpart uploads to every location, this one
    # only to the first - confirm which behaviour is intended.
    storage_location = storage_locations[0]

    if not storage_location.storage_config.signed_url:
        raise DataStorageCreationError("No signed URL returned from server")

    with tqdm(
        total=file_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        desc=f"Uploading {file_path.name}",
        miniters=1,
        mininterval=0.1,
    ) as pbar:
        await _aupload_file_with_progress(
            storage_location.storage_config.signed_url, file_path, pbar, file_size
        )

    status_response = await self.async_client.patch(
        f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
        json={"status": "active"},
    )
    status_response.raise_for_status()

    return DataStorageResponse.model_validate(status_response.json())
1505
+
1506
async def _aprocess_file_item(
    self,
    item: Path,
    dir_manifest: DirectoryManifest,
    current_parent_id: UUID,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse | None:
    """Asynchronously process a single file item for upload.

    The description comes from the manifest entry for the file name, falling
    back to ``"File: <name>"``. Upload failures are logged and swallowed so
    the caller can carry on with the remaining items.

    Args:
        item: File to upload
        dir_manifest: Manifest consulted for the file's description
        current_parent_id: ID of the entry the file is attached to
        dataset_id: Optional dataset ID to add the entry to
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry

    Returns:
        The upload response, or None when the upload raised.
    """
    try:
        manifest_desc = dir_manifest.get_entry_description(item.name)
        file_description = manifest_desc or f"File: {item.name}"

        logger.debug(
            f"Processing file {item.name} with description: '{file_description}'"
        )

        return await self._aupload_data_single_file_with_parent(
            name=item.name,
            file_path=item,
            description=file_description,
            file_path_override=None,
            parent_id=current_parent_id,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
        )
    except Exception as e:
        # Best effort: one failed file must not abort the whole directory.
        logger.error(f"Failed to upload file {item}: {e}")
        return None

async def _aupload_directory_hierarchically(
    self,
    name: str,
    dir_path: Path,
    description: str | None = None,
    manifest_filename: str | None = None,
    parent_id: UUID | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    base_dir: Path | None = None,
    dir_manifest: DirectoryManifest | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> list[DataStorageResponse]:
    """Upload a directory with single dataset and individual file storage entries (async).

    A call with ``parent_id=None`` is the root call: it creates the top-level
    entry, collects ignore patterns, loads the manifest, and takes the dataset
    ID from the created entry (the ``base_dir``, ``dir_manifest`` and
    ``dataset_id`` arguments are not used at that level). Recursive calls pass
    ``parent_id`` and reuse the patterns, manifest and dataset given to them.

    Args:
        name: Name for the root entry
        dir_path: Directory to walk
        description: Description for the root entry
        manifest_filename: Manifest file to load at the root level
        parent_id: Entry to attach children to; None marks the root call
        ignore_patterns: Patterns to skip
        ignore_filename: Name of the ignore file read at the root level
        base_dir: Root directory used when matching ignore patterns
        dir_manifest: Manifest describing this directory's children
        dataset_id: Dataset the child entries are added to
        project_id: ID of the project the entries belong to
        metadata: Optional metadata applied to created entries
        tags: Optional tags applied to created entries

    Returns:
        Responses for every entry created, in creation order.
    """
    responses = []

    if parent_id is None:
        base_dir = dir_path
        all_ignore_patterns = _collect_ignore_patterns(
            base_dir, ignore_patterns, ignore_filename
        )

        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            parent_id=None,
            dataset_id=None,
            is_collection=False,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
        )

        dir_response = await self._acreate_data_storage_entry(payload)
        responses.append(dir_response)
        current_parent_id = dir_response.data_storage.id
        current_dataset_id = dir_response.data_storage.dataset_id

        dir_manifest = self._load_directory_manifest(
            manifest_filename, parent_id, dir_path
        )
    else:
        all_ignore_patterns = ignore_patterns or []
        current_parent_id = parent_id
        current_dataset_id = dataset_id

    for item in dir_path.iterdir():
        if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
            continue

        if item.is_dir():
            subdir_manifest = DirectoryManifest()
            if dir_manifest:
                entry = dir_manifest.entries.get(item.name)
                if isinstance(entry, DirectoryManifest):
                    subdir_manifest = entry
                elif isinstance(entry, ManifestEntry):
                    # Convert single entry to manifest
                    subdir_manifest = DirectoryManifest(entries={item.name: entry})

            subdir_description = subdir_manifest.get_entry_description(item.name)
            if not subdir_description:
                subdir_description = self._generate_folder_description_from_files(
                    item, subdir_manifest
                )

            subdir_payload = DataStorageRequestPayload(
                name=item.name,
                description=subdir_description,
                parent_id=current_parent_id,
                dataset_id=current_dataset_id,
                is_collection=False,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
            subdir_response = await self._acreate_data_storage_entry(subdir_payload)
            responses.append(subdir_response)

            subdir_responses = await self._aupload_directory_hierarchically(
                name=item.name,
                dir_path=item,
                description=None,
                manifest_filename=None,
                parent_id=subdir_response.data_storage.id,
                ignore_patterns=all_ignore_patterns,
                ignore_filename=ignore_filename,
                base_dir=base_dir,
                dir_manifest=subdir_manifest,
                dataset_id=current_dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
            responses.extend(subdir_responses)
        elif item.is_file():
            # Forward project, metadata and tags so files get the same
            # attributes as directory entries, as in the sync implementation.
            file_response = await self._aprocess_file_item(
                item=item,
                dir_manifest=dir_manifest or DirectoryManifest(),
                current_parent_id=current_parent_id,
                dataset_id=current_dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
            # Failed uploads come back falsy and are left out of the result.
            if file_response:
                responses.append(file_response)

    return responses
1644
+
1645
@property
def client(self) -> Client:
    """Synchronous HTTP client used for data-storage requests.

    Raises:
        NotImplementedError: Always; a subclass is expected to override this.
    """
    message = "client property must be implemented by subclass"
    raise NotImplementedError(message)
1648
+
1649
@property
def async_client(self) -> AsyncClient:
    """Asynchronous HTTP client used for data-storage requests.

    Raises:
        NotImplementedError: Always; a subclass is expected to override this.
    """
    message = "async_client property must be implemented by subclass"
    raise NotImplementedError(message)
1654
+
1655
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def store_text_content(
    self,
    name: str,
    content: str,
    description: str | None = None,
    file_path: str | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Store content as a string in the data storage system.

    Args:
        name: Name of the data storage entry
        content: Content to store as a string
        description: Optional description of the data storage entry
        file_path: Optional path for the data storage entry
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                    - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry
    """
    try:
        text_request = DataStorageRequestPayload(
            name=name,
            content=content,
            description=description,
            file_path=file_path,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
            dataset_id=dataset_id,
            parent_id=parent_id,
        )
        return self._create_data_storage_entry(text_request)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "creating")
    # NOTE(review): every other exception is wrapped before the retry
    # decorator sees it - confirm retry_if_connection_error still matches
    # connection failures once they are wrapped.
    except Exception as exc:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {exc!r}"
        ) from exc
1737
+
1738
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def astore_text_content(
    self,
    name: str,
    content: str,
    description: str | None = None,
    file_path: str | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously store content as a string in the data storage system.

    Args:
        name: Name of the data storage entry
        content: Content to store as a string
        description: Optional description of the data storage entry
        file_path: Optional path for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                    - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry
    """
    try:
        text_request = DataStorageRequestPayload(
            name=name,
            content=content,
            description=description,
            file_path=file_path,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
            parent_id=parent_id,
        )
        return await self._acreate_data_storage_entry(text_request)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "creating")
    # NOTE(review): every other exception is wrapped before the retry
    # decorator sees it - confirm retry_if_connection_error still matches
    # connection failures once they are wrapped.
    except Exception as exc:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {exc!r}"
        ) from exc
1820
+
1821
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def astore_link(
    self,
    name: str,
    url: HttpUrl,
    description: str,
    instructions: str,
    api_key: str | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously store a link/URL in the data storage system.

    The URL is recorded both as the entry content and as a LINK storage
    location. ``instructions`` and, when given, ``api_key`` are merged into a
    copy of ``metadata`` attached to that location; the entry itself receives
    the caller's ``metadata`` unchanged.

    Args:
        name: Name of the link entry
        url: The URL/link to store
        description: Searchable details of the link
        instructions: Instructions for how to consume the link or api
        api_key: Any authentication key to access the api. If this is included, you should also include
            details of how the key should be consumed in the instructions.
        metadata: Any additional metadata about the link
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        project_id: ID of the project this data storage entry belongs to
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse containing the created link storage entry

    Raises:
        DataStorageCreationError: If there's an error creating the link storage entry
    """
    try:
        # Work on a copy so the caller's metadata dict is never mutated.
        link_metadata = metadata.copy() if metadata else {}
        link_metadata["instructions"] = instructions
        if api_key:
            link_metadata["api_key"] = api_key

        link_target = str(url)
        link_request = DataStorageRequestPayload(
            name=name,
            content=link_target,
            description=description,
            dataset_id=dataset_id,
            project_id=project_id,
            existing_location=DataStorageLocationPayload(
                storage_type=DataStorageType.LINK,
                content_type=DataContentType.TEXT,
                location=link_target,
                metadata=link_metadata or None,
            ),
            tags=tags,
            metadata=metadata,
            parent_id=parent_id,
        )
        return await self._acreate_data_storage_entry(link_request)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "creating")
    except Exception as exc:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {exc!r}"
        ) from exc
1892
+
1893
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def store_link(
    self,
    name: str,
    url: HttpUrl,
    description: str,
    instructions: str,
    api_key: str | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Store a link/URL in the data storage system.

    The entry is created with a LINK storage location whose content type is TEXT.
    The caller's ``metadata`` dict is copied (never mutated); the copy, extended with
    ``instructions`` and (when truthy) ``api_key``, becomes the location metadata,
    while the entry itself receives ``metadata`` exactly as passed.

    The call uses the same retry policy as ``astore_link``: up to 3 attempts with
    exponential backoff (capped at 10s), governed by ``retry_if_connection_error``.

    Args:
        name: Name of the link entry
        url: The URL/link to store; its string form is used both as the entry
            content and as the location
        description: Searchable details of the link
        instructions: Instructions for how to consume the link or api
        api_key: Any authentication key to access the api. If this is included, you
            should also include details of how the key should be consumed in the
            instructions.
        metadata: Any additional metadata about the link
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        project_id: ID of the project this data storage entry belongs to
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse containing the created link storage entry

    Raises:
        DataStorageCreationError: If there's an error creating the link storage entry.
            HTTP status errors are delegated to ``self._handle_http_errors``.
    """
    try:
        # Copy so the caller's dict is not mutated by the additions below.
        link_metadata = metadata.copy() if metadata else {}
        link_metadata["instructions"] = instructions
        if api_key:
            link_metadata["api_key"] = api_key

        existing_location = DataStorageLocationPayload(
            storage_type=DataStorageType.LINK,
            content_type=DataContentType.TEXT,
            location=str(url),
            metadata=link_metadata or None,
        )

        payload = DataStorageRequestPayload(
            name=name,
            content=str(url),
            description=description,
            dataset_id=dataset_id,
            project_id=project_id,
            existing_location=existing_location,
            tags=tags,
            metadata=metadata,
            parent_id=parent_id,
        )
        return self._create_data_storage_entry(payload)
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
1958
+
1959
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def store_file_content(
    self,
    name: str,
    file_path: str | Path,
    description: str | None = None,
    file_path_override: str | Path | None = None,
    as_collection: bool = False,
    manifest_filename: str | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    project_id: UUID | None = None,
    dataset_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Store file or directory content in the data storage system.

    For files: Small text files (< 10MB, supported formats) are sent as text content,
    larger/binary files are uploaded via signed URL.

    For directories: with ``as_collection=True`` the directory is zipped as a single
    file (with ignore pattern support) and uploaded as a collection; otherwise it is
    uploaded hierarchically and the first response from that upload is returned.

    Args:
        name: Name of the data storage entry
        file_path: Path to file or directory to upload
        description: Optional description of the data storage entry
        file_path_override: Optional path for the data storage entry (used for
            single-file and collection uploads)
        as_collection: If true, upload directories as a single zip file collection.
        manifest_filename: Name of manifest file (JSON or YAML), used for
            hierarchical directory uploads, containing:
            - entries - Map of file/directory names to their manifest entries
            - Each ManifestEntry contains:
                - description - Description of the file or directory
                - metadata - Additional metadata for the entry
            - Each DirectoryManifest contains nested entries following the same structure
        ignore_patterns: List of patterns to ignore when zipping directories
        ignore_filename: Name of ignore file to read from directory (default: .gitignore)
        project_id: ID of the project this data storage entry belongs to
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error in the process, including a
            hierarchical upload that produced no entries. HTTP status errors are
            delegated to ``self._handle_http_errors``.
    """
    file_path = self._validate_file_path(file_path)

    try:
        if file_path.is_dir():
            if as_collection:
                return self._upload_data_directory(
                    name=name,
                    dir_path=file_path,
                    description=description,
                    dir_path_override=file_path_override,
                    ignore_patterns=ignore_patterns,
                    ignore_filename=ignore_filename,
                    project_id=project_id,
                    dataset_id=dataset_id,
                    parent_id=parent_id,
                    metadata=metadata,
                    tags=tags,
                )
            responses = self._upload_directory_hierarchically(
                name=name,
                dir_path=file_path,
                description=description,
                manifest_filename=manifest_filename,
                ignore_patterns=ignore_patterns,
                ignore_filename=ignore_filename,
                project_id=project_id,
                dataset_id=dataset_id,
                parent_id=parent_id,
                metadata=metadata,
                tags=tags,
            )
            if not responses:
                raise DataStorageCreationError(
                    "No data storage entries were created"
                )
            return responses[0]
        # NOTE(review): only these five arguments are forwarded for single files;
        # dataset_id, metadata, tags and parent_id are not passed on here — confirm
        # against the helper's signature whether that is intended.
        return self._upload_data_single_file(
            name, file_path, description, file_path_override, project_id
        )

    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except DataStorageCreationError:
        # Already a domain error with a precise message; re-wrapping it below
        # would mislabel it as "unexpected".
        raise
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during file upload: {e!r}"
        ) from e
2088
+
2089
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def astore_file_content(
    self,
    name: str,
    file_path: str | Path,
    description: str | None = None,
    file_path_override: str | Path | None = None,
    as_collection: bool = False,
    manifest_filename: str | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously store file or directory content in the data storage system.

    Directories are either zipped into one collection (``as_collection=True``) or
    uploaded hierarchically; anything else is handled as a single-file upload.

    Args:
        name: Name of the data storage entry.
        file_path: Path to the file or directory to upload.
        description: Optional description for the entry.
        file_path_override: Optional GCS path for the entry (used for single-file
            and collection uploads).
        as_collection: If uploading a directory, `True` zips it into a single collection,
            `False` uploads it as a hierarchical structure of individual objects.
        manifest_filename: Optional manifest file (JSON or YAML) for hierarchical uploads containing:
            - entries - Map of file/directory names to their manifest entries
            - Each ManifestEntry contains:
                - description - Description of the file or directory
                - metadata - Additional metadata for the entry
            - Each DirectoryManifest contains nested entries following the same structure
        ignore_patterns: List of patterns to ignore when zipping.
        ignore_filename: Name of ignore file to read (default: .gitignore).
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset.
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

        For hierarchical uploads, this is the response for the root directory entry.

    Raises:
        DataStorageCreationError: If there's an error in the process, including a
            hierarchical upload that produced no entries. HTTP status errors are
            delegated to ``self._handle_http_errors``.
    """
    file_path = self._validate_file_path(file_path)

    try:
        if file_path.is_dir():
            if as_collection:
                return await self._aupload_data_directory(
                    name=name,
                    dir_path=file_path,
                    description=description,
                    dir_path_override=file_path_override,
                    ignore_patterns=ignore_patterns,
                    ignore_filename=ignore_filename,
                    project_id=project_id,
                    metadata=metadata,
                    tags=tags,
                    dataset_id=dataset_id,
                    parent_id=parent_id,
                )
            responses = await self._aupload_directory_hierarchically(
                name=name,
                dir_path=file_path,
                description=description,
                manifest_filename=manifest_filename,
                ignore_patterns=ignore_patterns,
                ignore_filename=ignore_filename,
                dataset_id=dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
                parent_id=parent_id,
            )
            if not responses:
                raise DataStorageCreationError(
                    "No data storage entries were created"
                )
            return responses[0]
        # NOTE(review): metadata, tags and parent_id are not forwarded for single
        # files — confirm against the helper's signature whether that is intended.
        return await self._aupload_data_single_file(
            name, file_path, description, file_path_override, dataset_id, project_id
        )

    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except DataStorageCreationError:
        # Already a domain error with a precise message; re-wrapping it below
        # would mislabel it as "unexpected".
        raise
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during async file upload: {e!r}"
        ) from e
2211
+
2212
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def register_existing_data_source(
    self,
    name: str,
    existing_location: DataStorageLocationPayload,
    description: str | None = None,
    as_collection: bool = False,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
    dataset_id: UUID | None = None,
) -> DataStorageResponse:
    """Register an already-existing data source as a data storage entry.

    No content is uploaded: the request only carries ``existing_location`` plus the
    entry attributes, and is POSTed to ``/v0.1/data-storage/data-entries``.

    Args:
        name: Name of the data storage entry
        existing_location: a pydantic model describing the existing data source location to register, containing:
            - storage_type - Type of storage (BIGQUERY, GCS, PG_TABLE, RAW_CONTENT, ELASTIC_SEARCH)
            - content_type - Type of content (BQ_DATASET, BQ_TABLE, TEXT, TEXT_W_EMBEDDINGS, DIRECTORY, FILE, INDEX, INDEX_W_EMBEDDINGS)
            - content_schema - Content schema for the data
            - metadata - Additional metadata for the location
            - location - Location path or identifier
        description: Optional description of the data storage entry
        as_collection: Sent as the entry's ``is_collection`` flag. `True` creates a
            single storage entry for the whole directory and multiple storage
            locations for each file, `False` assumes a single file.
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry.
            HTTP status errors are delegated to ``self._handle_http_errors``.
    """
    try:
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            existing_location=existing_location,
            project_id=project_id,
            is_collection=as_collection,
            metadata=metadata,
            tags=tags,
            parent_id=parent_id,
            dataset_id=dataset_id,
        )
        response = self.client.post(
            "/v0.1/data-storage/data-entries",
            # mode="json" turns UUIDs (and other non-JSON-native values) into
            # JSON-safe primitives; a plain python-mode dump leaves UUID objects
            # that the stdlib JSON encoder cannot serialize.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return DataStorageResponse.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2307
+
2308
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aregister_existing_data_source(
    self,
    name: str,
    existing_location: DataStorageLocationPayload,
    as_collection: bool = False,
    description: str | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
    dataset_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously register an already-existing data source as a data storage entry.

    No content is uploaded: the request only carries ``existing_location`` plus the
    entry attributes, and is POSTed to ``/v0.1/data-storage/data-entries``.

    Args:
        name: Name of the data storage entry
        existing_location: a pydantic model describing the existing data source location to register, containing:
            - storage_type - Type of storage (BIGQUERY, GCS, PG_TABLE, RAW_CONTENT, ELASTIC_SEARCH)
            - content_type - Type of content (BQ_DATASET, BQ_TABLE, TEXT, TEXT_W_EMBEDDINGS, DIRECTORY, FILE, INDEX, INDEX_W_EMBEDDINGS)
            - content_schema - Content schema for the data
            - metadata - Additional metadata for the location
            - location - Location path or identifier
        as_collection: Sent as the entry's ``is_collection`` flag. `True` creates a
            single storage entry for the whole directory and multiple storage
            locations for each file, `False` assumes a single file.
        description: Optional description of the data storage entry
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry.
            HTTP status errors are delegated to ``self._handle_http_errors``.
    """
    try:
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            existing_location=existing_location,
            project_id=project_id,
            is_collection=as_collection,
            metadata=metadata,
            tags=tags,
            parent_id=parent_id,
            dataset_id=dataset_id,
        )
        response = await self.async_client.post(
            "/v0.1/data-storage/data-entries",
            # mode="json" turns UUIDs (and other non-JSON-native values) into
            # JSON-safe primitives; a plain python-mode dump leaves UUID objects
            # that the stdlib JSON encoder cannot serialize.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return DataStorageResponse.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2403
+
2404
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def search_data_storage(
    self,
    criteria: list[SearchCriterion] | None = None,
    limit: int = 10,
    offset: int = 0,
    filter_logic: FilterLogic = FilterLogic.OR,
) -> list[dict]:
    """Search data storage objects using structured criteria.

    Args:
        criteria: List of SearchCriterion pydantic models with fields:
            - field - Field name to search on
            - operator - Search operator (EQUALS, CONTAINS, STARTS_WITH, ENDS_WITH, GREATER_THAN, LESS_THAN, BETWEEN, IN)
            - value - Value to search for
            ``None`` is sent as an empty list.
        limit: Number of results to return; values outside 1-100 are clamped
            into that range
        offset: Number of results to skip
        filter_logic: Either "AND" (all criteria must match) or "OR" (at least one must match)

    Returns:
        List of search results with scores and data storage information
        (the decoded JSON body of the response)

    Raises:
        DataStorageCreationError: If there's an error searching data storage entries,
            including a 503 (search unavailable) response. Other HTTP status errors
            are delegated to ``self._handle_http_errors``.

    Example:
        from edison_client.models.rest import SearchCriterion, SearchOperator
        criteria = [
            SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
            SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
            SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
        ]
        results = client.search_data_storage(criteria=criteria, limit=20)
    """
    try:
        payload = DataStorageSearchPayload(
            criteria=criteria or [],
            limit=max(1, min(100, limit)),  # Clamp between 1-100
            offset=offset,
            filter_logic=filter_logic,
        )

        response = self.client.post(
            "/v0.1/data-storage/search",
            json=payload.model_dump(mode="json"),
        )
        response.raise_for_status()
        return response.json()

    except HTTPStatusError as e:
        # 503 gets a dedicated message; every other status goes to the shared handler.
        if e.response.status_code == codes.SERVICE_UNAVAILABLE:
            raise DataStorageCreationError(
                "Search functionality is currently unavailable"
            ) from e
        self._handle_http_errors(e, "searching")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during search: {e!r}"
        ) from e
2468
+
2469
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def asearch_data_storage(
    self,
    criteria: list[SearchCriterion] | None = None,
    limit: int = 10,
    offset: int = 0,
    filter_logic: FilterLogic = FilterLogic.OR,
) -> list[dict]:
    """Asynchronously search data storage objects using structured criteria.

    Args:
        criteria: List of SearchCriterion pydantic models with fields:
            - field - Field name to search on
            - operator - Search operator (EQUALS, CONTAINS, STARTS_WITH, ENDS_WITH, GREATER_THAN, LESS_THAN, BETWEEN, IN)
            - value - Value to search for
            ``None`` is sent as an empty list.
        limit: Number of results to return; values outside 1-100 are clamped
            into that range
        offset: Number of results to skip
        filter_logic: Either "AND" (all criteria must match) or "OR" (at least one must match)

    Returns:
        List of search results with scores and data storage information
        (the decoded JSON body of the response)

    Raises:
        DataStorageCreationError: If there's an error searching data storage entries,
            including a 503 (search unavailable) response. Other HTTP status errors
            are delegated to ``self._handle_http_errors``.

    Example:
        from edison_client.models.rest import SearchCriterion, SearchOperator
        criteria = [
            SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
            SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
            SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
        ]
        results = await client.asearch_data_storage(criteria=criteria, limit=20)
    """
    try:
        payload = DataStorageSearchPayload(
            criteria=criteria or [],
            limit=max(1, min(100, limit)),  # Clamp between 1-100
            offset=offset,
            filter_logic=filter_logic,
        )

        response = await self.async_client.post(
            "/v0.1/data-storage/search",
            json=payload.model_dump(mode="json"),
        )
        response.raise_for_status()
        return response.json()

    except HTTPStatusError as e:
        # 503 gets a dedicated message; every other status goes to the shared handler.
        if e.response.status_code == codes.SERVICE_UNAVAILABLE:
            raise DataStorageCreationError(
                "Search functionality is currently unavailable"
            ) from e
        self._handle_http_errors(e, "searching")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during async search: {e!r}"
        ) from e
2533
+
2534
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def similarity_search_data_storage(
    self,
    embedding: list[float],
    size: int = 10,
    min_score: float = 0.7,
    dataset_id: UUID | None = None,
    tags: list[str] | None = None,
    user_id: str | None = None,
    project_id: str | None = None,
) -> list[dict]:
    """Search data storage objects using vector similarity.

    Args:
        embedding: List of float values representing the embedding vector for similarity search
        size: Number of results to return; values outside 1-100 are clamped
        min_score: Minimum similarity score; values outside 0.0-1.0 are clamped
        dataset_id: Optional dataset ID filter (sent as a string)
        tags: Optional list of string tags to filter by
        user_id: Optional user ID filter (admin only)
        project_id: Optional project ID filter

    Returns:
        List of search results with similarity scores and data storage information
        (the decoded JSON body of the response)

    Raises:
        DataStorageCreationError: If ``embedding`` is empty or contains non-numeric
            values, if the service answers 503, or on any other unexpected failure.
            Other HTTP status errors are delegated to ``self._handle_http_errors``.
    """
    try:
        # Validate inputs
        if not embedding:
            raise DataStorageCreationError("Embedding vector is required")

        if not all(isinstance(x, int | float) for x in embedding):
            raise DataStorageCreationError("Embedding must be a list of numbers")

        # Build request payload with out-of-range values clamped
        payload: dict[str, Any] = {
            "embedding": embedding,
            "size": max(1, min(100, size)),  # Clamp between 1-100
            "min_score": max(0.0, min(1.0, min_score)),  # Clamp between 0.0-1.0
        }

        # Add optional filters; only those actually supplied are sent
        optional_filters = {
            "dataset_id": None if dataset_id is None else str(dataset_id),
            "tags": tags,
            "user_id": user_id,
            "project_id": project_id,
        }
        payload.update(
            {key: value for key, value in optional_filters.items() if value is not None}
        )

        response = self.client.post(
            "/v0.1/data-storage/similarity-search", json=payload
        )
        response.raise_for_status()
        return response.json()

    except HTTPStatusError as e:
        if e.response.status_code == codes.SERVICE_UNAVAILABLE:
            raise DataStorageCreationError(
                "Similarity search functionality is currently unavailable"
            ) from e
        self._handle_http_errors(e, "performing similarity search")
    except DataStorageCreationError:
        # Validation failures above already carry a precise message; do not
        # re-wrap them as "unexpected" errors.
        raise
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during similarity search: {e!r}"
        ) from e
2611
+
2612
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def asimilarity_search_data_storage(
    self,
    embedding: list[float],
    size: int = 10,
    min_score: float = 0.7,
    dataset_id: UUID | None = None,
    tags: list[str] | None = None,
    user_id: str | None = None,
    project_id: str | None = None,
) -> list[dict]:
    """Asynchronously search data storage objects using vector similarity.

    Args:
        embedding: Non-empty list of numbers used as the query vector.
        size: Number of results to request; values outside 1-100 are clamped.
        min_score: Minimum similarity score; values outside 0.0-1.0 are clamped.
        dataset_id: Optional dataset ID filter (sent to the API as a string).
        tags: Optional list of string tags to filter by.
        user_id: Optional user ID filter (admin only).
        project_id: Optional project ID filter.

    Returns:
        The decoded JSON body of the similarity-search response.

    Raises:
        DataStorageCreationError: If the embedding is empty or contains
            non-numeric values, if the service responds with 503, or if an
            unexpected error occurs. Other HTTP status errors are delegated
            to ``self._handle_http_errors``.
    """
    try:
        # Validate inputs
        if not embedding:
            raise DataStorageCreationError("Embedding vector is required")

        if not all(isinstance(x, int | float) for x in embedding):
            raise DataStorageCreationError("Embedding must be a list of numbers")

        size = max(1, min(100, size))  # Clamp between 1-100
        min_score = max(0.0, min(1.0, min_score))  # Clamp between 0.0-1.0

        # Build request payload
        payload: dict[str, Any] = {
            "embedding": embedding,
            "size": size,
            "min_score": min_score,
        }

        # Add optional filters, omitting the ones the caller did not set
        if dataset_id is not None:
            payload["dataset_id"] = str(dataset_id)
        if tags is not None:
            payload["tags"] = tags
        if user_id is not None:
            payload["user_id"] = user_id
        if project_id is not None:
            payload["project_id"] = project_id

        response = await self.async_client.post(
            "/v0.1/data-storage/similarity-search", json=payload
        )
        response.raise_for_status()
        return response.json()

    except HTTPStatusError as e:
        if e.response.status_code == codes.SERVICE_UNAVAILABLE:
            raise DataStorageCreationError(
                "Similarity search functionality is currently unavailable"
            ) from e
        self._handle_http_errors(e, "performing similarity search")
    except DataStorageCreationError:
        # Validation errors raised above already carry a precise message;
        # let them through instead of re-wrapping them as "unexpected".
        raise
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during async similarity search: {e!r}"
        ) from e
2689
+
2690
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def fetch_data_from_storage(
    self,
    data_storage_id: UUID | None = None,
) -> RawFetchResponse | Path | list[Path] | None:
    """Fetch the data behind a data storage entry (sync version).

    Args:
        data_storage_id: UUID of the data storage entry to fetch. Required,
            despite the ``None`` default.

    Returns:
        For entries with more than one storage location: a list with the
            result of ``self._download_from_gcs`` for each location.
        For a single ``gcs`` location: the result of ``self._download_from_gcs``.
        For ``raw_content`` / ``pg_table`` storage: a ``RawFetchResponse``
            holding the content (plus the filename when the entry has a
            ``file_path``), or ``None`` when the entry has no content.

    Raises:
        DataStorageRetrievalError: If no ID is given, the entry has no storage
            locations, a GCS location has no signed URL, the storage type is
            unsupported, or an unexpected error occurs. HTTP status errors are
            delegated to ``self._handle_http_errors``.
    """
    if not data_storage_id:
        raise DataStorageRetrievalError(
            "data_storage_id must be provided at this time"
        )

    try:
        response = self.client.get(
            f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
        )
        response.raise_for_status()
        result = DataStorageResponse.model_validate(response.json())

        # Fail with a clear message rather than an IndexError on [0] below.
        if not result.storage_locations:
            raise DataStorageRetrievalError(
                f"No storage locations found for data storage entry {data_storage_id}"
            )

        if len(result.storage_locations) > 1:
            return [
                self._download_from_gcs(
                    location.storage_config.signed_url or "",
                    (
                        Path(location.storage_config.location).name
                        if location.storage_config.location
                        else None
                    ),
                )
                for location in result.storage_locations
            ]

        # Most scenarios will only have one location
        storage_location = result.storage_locations[0]
        storage_type = storage_location.storage_config.storage_type

        if storage_type == "gcs":
            if not storage_location.storage_config.signed_url:
                raise DataStorageRetrievalError(
                    "No signed URL available for GCS download"
                )

            return self._download_from_gcs(
                storage_location.storage_config.signed_url,
                (
                    Path(storage_location.storage_config.location).name
                    if storage_location.storage_config.location
                    else None
                ),
            )

        if storage_type in {"raw_content", "pg_table"}:
            content = result.data_storage.content
            if content is None:
                logger.warning(
                    f"No content found for data storage entry {data_storage_id}"
                )
                return None

            if result.data_storage.file_path:
                return RawFetchResponse(
                    filename=Path(result.data_storage.file_path),
                    content=content,
                    entry_id=result.data_storage.id,
                    entry_name=result.data_storage.name,
                )

            return RawFetchResponse(
                content=content,
                entry_id=result.data_storage.id,
                entry_name=result.data_storage.name,
            )

        raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")

    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except DataStorageRetrievalError:
        # Errors raised deliberately above already carry a precise message;
        # let them through instead of re-wrapping them as "unexpected".
        raise
    except Exception as e:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2785
+
2786
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def afetch_data_from_storage(
    self,
    data_storage_id: UUID | None = None,
) -> RawFetchResponse | Path | list[Path] | None:
    """Fetch the data behind a data storage entry (async version).

    Args:
        data_storage_id: UUID of the data storage entry to fetch. Required,
            despite the ``None`` default.

    Returns:
        For entries with more than one storage location: the gathered results
            of ``self._adownload_from_gcs`` for each location, run with at
            most ``DOWNLOAD_CONCURRENCY`` downloads in flight.
        For a single ``gcs`` location: the result of ``self._adownload_from_gcs``.
        For ``raw_content`` / ``pg_table`` storage: a ``RawFetchResponse``
            holding the content (plus the filename when the entry has a
            ``file_path``), or ``None`` when the entry has no content.

    Raises:
        DataStorageRetrievalError: If no ID is given, the entry has no storage
            locations, a GCS location has no signed URL, the storage type is
            unsupported, or an unexpected error occurs. HTTP status errors are
            delegated to ``self._handle_http_errors``.
    """
    if not data_storage_id:
        raise DataStorageRetrievalError(
            "data_storage_id must be provided at this time"
        )

    try:
        response = await self.async_client.get(
            f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
        )
        response.raise_for_status()
        result = DataStorageResponse.model_validate(response.json())

        # Fail with a clear message rather than an IndexError on [0] below.
        if not result.storage_locations:
            raise DataStorageRetrievalError(
                f"No storage locations found for data storage entry {data_storage_id}"
            )

        if len(result.storage_locations) > 1:
            return await gather_with_concurrency(
                DOWNLOAD_CONCURRENCY,
                [
                    self._adownload_from_gcs(
                        location.storage_config.signed_url or "",
                        (
                            location.storage_config.location.split("/")[-1]
                            if location.storage_config.location
                            else None
                        ),
                    )
                    for location in result.storage_locations
                ],
            )

        # Most scenarios will only have one location
        storage_location = result.storage_locations[0]
        storage_type = storage_location.storage_config.storage_type

        if storage_type == "gcs":
            if not storage_location.storage_config.signed_url:
                raise DataStorageRetrievalError(
                    "No signed URL available for GCS download"
                )

            return await self._adownload_from_gcs(
                storage_location.storage_config.signed_url,
                (
                    storage_location.storage_config.location.split("/")[-1]
                    if storage_location.storage_config.location
                    else None
                ),
            )

        if storage_type in {"raw_content", "pg_table"}:
            content = result.data_storage.content
            if content is None:
                logger.warning(
                    f"No content found for data storage entry {data_storage_id}"
                )
                return None

            if result.data_storage.file_path:
                return RawFetchResponse(
                    filename=Path(result.data_storage.file_path),
                    content=content,
                    entry_id=result.data_storage.id,
                    entry_name=result.data_storage.name,
                )

            return RawFetchResponse(
                content=content,
                entry_id=result.data_storage.id,
                entry_name=result.data_storage.name,
            )

        raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")

    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except DataStorageRetrievalError:
        # Errors raised deliberately above already carry a precise message;
        # let them through instead of re-wrapping them as "unexpected".
        raise
    except Exception as e:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2884
+
2885
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def acreate_dataset(
    self,
    name: str,
    description: str | None = None,
    dataset_id: UUID | None = None,
) -> CreateDatasetPayload:
    """Asynchronously create a new dataset.

    Args:
        name: Name of the dataset to create
        description: Optional description of the dataset
        dataset_id: Optional UUID to assign to the dataset, or None to auto-generate

    Returns:
        CreateDatasetPayload: The server's JSON response validated into a
            Pydantic model with ``id``, ``name`` and ``description`` fields.

    Raises:
        DataStorageCreationError: If an unexpected error occurs while creating
            the dataset. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        payload = CreateDatasetPayload(
            name=name,
            description=description,
            id=dataset_id,
        )
        response = await self.async_client.post(
            "/v0.1/data-storage/datasets",
            # mode="json" turns the UUID id into a string; a python-mode dump
            # keeps a UUID object, which the JSON encoder cannot serialize.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return CreateDatasetPayload.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2931
+
2932
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def create_dataset(
    self,
    name: str,
    description: str | None = None,
    dataset_id: UUID | None = None,
) -> CreateDatasetPayload:
    """Create a new dataset.

    Args:
        name: Name of the dataset to create
        description: Optional description of the dataset
        dataset_id: Optional UUID to assign to the dataset, or None to auto-generate

    Returns:
        CreateDatasetPayload: The server's JSON response validated into a
            Pydantic model with ``id``, ``name`` and ``description`` fields.

    Raises:
        DataStorageCreationError: If an unexpected error occurs while creating
            the dataset. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        payload = CreateDatasetPayload(
            name=name,
            description=description,
            id=dataset_id,
        )
        response = self.client.post(
            "/v0.1/data-storage/datasets",
            # mode="json" turns the UUID id into a string; a python-mode dump
            # keeps a UUID object, which the JSON encoder cannot serialize.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return CreateDatasetPayload.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2978
+
2979
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def adelete_dataset(self, dataset_id: UUID) -> None:
    """Asynchronously delete a dataset.

    Note: This will delete all data storage entries associated with the dataset.

    Args:
        dataset_id: ID of the dataset to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            dataset. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        response = await self.async_client.delete(
            f"/v0.1/data-storage/datasets/{dataset_id}"
        )
        # Without this call a 4xx/5xx response is silently treated as success
        # and the HTTPStatusError handler below can never run.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3002
+
3003
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def delete_dataset(self, dataset_id: UUID) -> None:
    """Delete a dataset.

    Note: This will delete all data storage entries associated with the dataset.

    Args:
        dataset_id: ID of the dataset to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            dataset. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        response = self.client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
        # Without this call a 4xx/5xx response is silently treated as success
        # and the HTTPStatusError handler below can never run.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3026
+
3027
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aget_dataset(self, dataset_id: UUID) -> GetDatasetAndEntriesResponse:
    """Asynchronously look up a dataset, together with its entries, by ID.

    Args:
        dataset_id: UUID of the dataset to retrieve

    Returns:
        GetDatasetAndEntriesResponse: The response JSON validated into a model with:
            - dataset: DatasetStorage with fields:
                - id - Unique identifier for the dataset
                - name - Name of the dataset
                - user_id - ID of the user who created the dataset
                - description - Description of the dataset
                - created_at - Timestamp when the dataset was created
                - modified_at - Timestamp when the dataset was last modified
            - data_storage_entries - List of data storage entries in the dataset, each containing:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated

    Raises:
        DataStorageError: If an unexpected error occurs while retrieving the
            dataset. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        dataset_url = f"/v0.1/data-storage/datasets/{dataset_id}"
        resp = await self.async_client.get(dataset_url)
        resp.raise_for_status()

        body = resp.json()
        return GetDatasetAndEntriesResponse.model_validate(body)
    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3079
+
3080
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def get_dataset(self, dataset_id: UUID) -> GetDatasetAndEntriesResponse:
    """Look up a dataset, together with its entries, by ID (sync version).

    Args:
        dataset_id: UUID of the dataset to retrieve

    Returns:
        GetDatasetAndEntriesResponse: The response JSON validated into a model with:
            - dataset: DatasetStorage with fields:
                - id - Unique identifier for the dataset
                - name - Name of the dataset
                - user_id - ID of the user who created the dataset
                - description - Description of the dataset
                - created_at - Timestamp when the dataset was created
                - modified_at - Timestamp when the dataset was last modified
            - data_storage_entries - List of data storage entries in the dataset, each containing:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated

    Raises:
        DataStorageError: If an unexpected error occurs while retrieving the
            dataset. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        dataset_url = f"/v0.1/data-storage/datasets/{dataset_id}"
        resp = self.client.get(dataset_url)
        resp.raise_for_status()

        body = resp.json()
        return GetDatasetAndEntriesResponse.model_validate(body)
    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3130
+
3131
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def get_data_storage_entry(self, data_storage_id: UUID) -> DataStorageResponse:
    """Fetch one data storage entry, including its storage locations and metadata.

    Args:
        data_storage_id: ID of the data storage entry to retrieve

    Returns:
        The response JSON validated into a DataStorageResponse.

    Raises:
        DataStorageRetrievalError: If an unexpected error occurs while
            retrieving the entry. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        entry_url = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        resp = self.client.get(entry_url, timeout=100)
        resp.raise_for_status()
        body = resp.json()
        return DataStorageResponse.model_validate(body)
    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except Exception as e:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {e!r}"
        ) from e
3161
+
3162
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aget_data_storage_entry(
    self, data_storage_id: UUID
) -> DataStorageResponse:
    """Asynchronously fetch one data storage entry, including its storage locations and metadata.

    Args:
        data_storage_id: ID of the data storage entry to retrieve

    Returns:
        The response JSON validated into a DataStorageResponse.

    Raises:
        DataStorageRetrievalError: If an unexpected error occurs while
            retrieving the entry. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        entry_url = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        resp = await self.async_client.get(entry_url, timeout=100)
        resp.raise_for_status()
        body = resp.json()
        return DataStorageResponse.model_validate(body)
    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except Exception as e:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {e!r}"
        ) from e
3194
+
3195
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def adelete_data_storage_entry(self, data_storage_entry_id: UUID) -> None:
    """Asynchronously delete a data storage entry.

    Args:
        data_storage_entry_id: UUID of the data storage entry to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            entry. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        response = await self.async_client.delete(
            f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
        )
        # Without this call a 4xx/5xx response is silently treated as success
        # and the HTTPStatusError handler below can never run.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3218
+
3219
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def delete_data_storage_entry(self, data_storage_entry_id: UUID) -> None:
    """Delete a data storage entry.

    Args:
        data_storage_entry_id: UUID of the data storage entry to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            entry. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        response = self.client.delete(
            f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
        )
        # Without this call a 4xx/5xx response is silently treated as success
        # and the HTTPStatusError handler below can never run.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3242
+
3243
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aupdate_entry_permissions(
    self,
    data_storage_id: UUID,
    share_status: ShareStatus,
    permitted_accessors: PermittedAccessors,
) -> DataStorageResponse:
    """Asynchronously change the sharing permissions of a data storage entry.

    Sends a PATCH carrying the new share status and the dumped
    permitted-accessors model.

    Args:
        data_storage_id: UUID of the data storage entry to update
        share_status: Share status to set
        permitted_accessors: Permitted accessors to set

    Returns:
        The response JSON validated into a DataStorageResponse.

    Raises:
        DataStorageError: If an unexpected error occurs while updating the
            permissions. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        entry_url = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        patch_body = {
            "share_status": share_status,
            "permitted_accessors": permitted_accessors.model_dump(),
        }
        resp = await self.async_client.patch(entry_url, json=patch_body)
        resp.raise_for_status()
        return DataStorageResponse.model_validate(resp.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "updating")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3282
+
3283
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def update_entry_permissions(
    self,
    data_storage_id: UUID,
    share_status: ShareStatus,
    permitted_accessors: PermittedAccessors,
) -> DataStorageResponse:
    """Change the sharing permissions of a data storage entry (sync version).

    Sends a PATCH carrying the new share status and the dumped
    permitted-accessors model.

    Args:
        data_storage_id: UUID of the data storage entry to update
        share_status: Share status to set
        permitted_accessors: Permitted accessors to set

    Returns:
        The response JSON validated into a DataStorageResponse.

    Raises:
        DataStorageError: If an unexpected error occurs while updating the
            permissions. HTTP status errors are delegated to
            ``self._handle_http_errors``.
    """
    try:
        entry_url = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        patch_body = {
            "share_status": share_status,
            "permitted_accessors": permitted_accessors.model_dump(),
        }
        resp = self.client.patch(entry_url, json=patch_body)
        resp.raise_for_status()
        return DataStorageResponse.model_validate(resp.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "updating")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e