edison-client 0.6.8.dev103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3280 @@
1
+ import asyncio
2
+ import fnmatch
3
+ import json
4
+ import logging
5
+ import shutil
6
+ import tempfile
7
+ import zipfile
8
+ from os import PathLike
9
+ from pathlib import Path
10
+ from typing import Any, NoReturn
11
+ from uuid import UUID
12
+
13
+ import aiofiles
14
+ import aiohttp
15
+ import requests as requests_lib
16
+ from google.resumable_media import requests as resumable_requests
17
+ from httpx import AsyncClient, Client, HTTPStatusError, codes
18
+ from lmi.utils import gather_with_concurrency
19
+ from pydantic import HttpUrl
20
+ from requests.adapters import HTTPAdapter
21
+ from tenacity import (
22
+ before_sleep_log,
23
+ retry,
24
+ stop_after_attempt,
25
+ wait_exponential,
26
+ )
27
+ from tqdm import tqdm
28
+ from urllib3.util.retry import Retry
29
+
30
+ from edison_client.models.data_storage_methods import (
31
+ CreateDatasetPayload,
32
+ DataContentType,
33
+ DataStorageLocationPayload,
34
+ DataStorageRequestPayload,
35
+ DataStorageResponse,
36
+ DataStorageType,
37
+ DirectoryManifest,
38
+ GetDatasetAndEntriesResponse,
39
+ ManifestEntry,
40
+ PermittedAccessors,
41
+ RawFetchResponse,
42
+ ShareStatus,
43
+ )
44
+ from edison_client.models.rest import (
45
+ DataStorageSearchPayload,
46
+ FilterLogic,
47
+ SearchCriterion,
48
+ )
49
+ from edison_client.utils.general import retry_if_connection_error
50
+
51
+ # this is only required if they're using a yaml manifest
52
+ try:
53
+ import yaml
54
+ except ImportError:
55
+ yaml = None # type: ignore[assignment]
56
+
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
# TODO: pdf support, unsure what package we want to use
# File extensions (lower-case, no leading dot) that are read as UTF-8 text.
SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT = [
    "txt",
    "md",
    "csv",
    "json",
    "yaml",
    "yml",
]
# Size of each chunk sent during a resumable upload.
CHUNK_SIZE = 8 << 20  # 8MB
# Number of attempts made per chunk / per HTTP request before giving up.
MAX_RETRIES = 3
# Files strictly below this size are candidates for inline text upload.
SMALL_FILE_THRESHOLD_BYTES = 10 << 20  # 10MB
# Status code compared against chunk responses that are not yet final.
HTTP_RESUME_INCOMPLETE = 308
# Headers sent with the POST that opens a resumable upload session.
INITIATE_HEADERS = {
    "Content-Type": "application/octet-stream",
    "x-goog-resumable": "start",
    "Content-Length": "0",
}
DOWNLOAD_CONCURRENCY = 3
72
+
73
+
74
+ def _should_ignore_file(
75
+ file_path: Path | PathLike,
76
+ base_path: Path | PathLike,
77
+ ignore_patterns: list[str] | None = None,
78
+ ) -> bool:
79
+ """Check if a file should be ignored based on ignore patterns.
80
+
81
+ Args:
82
+ file_path: Path to the file to check
83
+ base_path: Base directory path
84
+ ignore_patterns: List of ignore patterns (supports gitignore-style patterns)
85
+
86
+ Returns:
87
+ True if file should be ignored
88
+ """
89
+ if not ignore_patterns:
90
+ return False
91
+
92
+ try:
93
+ file_path = Path(file_path)
94
+ base_path = Path(base_path)
95
+ rel_path = file_path.relative_to(base_path)
96
+ rel_path_str = str(rel_path)
97
+
98
+ for pattern in ignore_patterns:
99
+ pattern = pattern.strip()
100
+ if not pattern or pattern.startswith("#"):
101
+ continue
102
+
103
+ is_absolute_match = pattern.startswith("/") and rel_path_str.startswith(
104
+ pattern[1:]
105
+ )
106
+ is_nested_match = "/" in pattern and pattern in rel_path_str
107
+ is_name_match = fnmatch.fnmatch(file_path.name, pattern)
108
+ is_part_match = pattern in rel_path.parts
109
+
110
+ if is_absolute_match or is_nested_match or is_name_match or is_part_match:
111
+ return True
112
+
113
+ except ValueError:
114
+ pass
115
+
116
+ return False
117
+
118
+
119
+ def _read_ignore_file(dir_path: Path, ignore_filename: str = ".gitignore") -> list[str]:
120
+ """Read ignore patterns from a file in the directory.
121
+
122
+ Args:
123
+ dir_path: Directory to look for ignore file
124
+ ignore_filename: Name of ignore file to read
125
+
126
+ Returns:
127
+ List of ignore patterns
128
+ """
129
+ ignore_file = dir_path / ignore_filename
130
+ if ignore_file.exists():
131
+ try:
132
+ with open(ignore_file, encoding="utf-8") as f:
133
+ return [line.strip() for line in f]
134
+ except Exception as e:
135
+ logger.warning(f"Failed to read {ignore_filename}: {e}")
136
+ return []
137
+ else:
138
+ return []
139
+
140
+
141
def _collect_ignore_patterns(
    dir_path: Path,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
) -> list[str]:
    """Collect all ignore patterns from multiple sources.

    The result is, in order: the explicit ``ignore_patterns``, the lines read
    from ``dir_path / ignore_filename``, and a fixed set of default ignores.

    Args:
        dir_path: Directory to check for ignore files
        ignore_patterns: Explicit ignore patterns (never modified)
        ignore_filename: Name of ignore file to read from directory

    Returns:
        Combined list of ignore patterns (always a new list)
    """
    # Copy first: extending the caller's list in place would make the
    # patterns grow on every call that reuses the same list object.
    all_ignore_patterns = list(ignore_patterns) if ignore_patterns else []
    all_ignore_patterns.extend(_read_ignore_file(dir_path, ignore_filename))

    default_ignores = [".git", "__pycache__", "*.pyc", ".DS_Store", "node_modules"]
    all_ignore_patterns.extend(default_ignores)

    return all_ignore_patterns
164
+
165
+
166
def _create_directory_zip(
    dir_path: Path,
    zip_path: Path,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
) -> int:
    """Write a deflate-compressed zip of a directory, honouring ignore patterns.

    Every regular file found recursively under ``dir_path`` that is not
    excluded by the collected ignore patterns is stored under its path
    relative to ``dir_path``.

    Args:
        dir_path: Directory to zip
        zip_path: Output zip file path
        ignore_patterns: Explicit ignore patterns
        ignore_filename: Name of ignore file to read from directory

    Returns:
        Size of created zip file in bytes
    """
    patterns = _collect_ignore_patterns(dir_path, ignore_patterns, ignore_filename)

    logger.debug(f"Creating zip with ignore patterns: {patterns}")

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for candidate in dir_path.rglob("*"):
            if not candidate.is_file():
                continue
            if _should_ignore_file(candidate, dir_path, patterns):
                continue
            member_name = candidate.relative_to(dir_path)
            archive.write(candidate, member_name)
            logger.debug(f"Added to zip: {member_name}")

    size_in_bytes = zip_path.stat().st_size
    logger.debug(f"Created zip file {zip_path} with size {size_in_bytes:,} bytes")
    return size_in_bytes
201
+
202
+
203
def _should_send_as_text_content(file_path: Path, file_size: int) -> bool:
    """Decide whether a file is uploaded inline as text rather than as a file.

    Args:
        file_path: Path to the file
        file_size: Size of file in bytes

    Returns:
        True if the file is below the small-file threshold and its extension
        (case-insensitive) is one of the supported text types
    """
    is_small_enough = file_size < SMALL_FILE_THRESHOLD_BYTES
    extension = file_path.suffix.lower().lstrip(".")
    return is_small_enough and extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT
219
+
220
+
221
def _extract_text_from_file(file_path: Path) -> str | None:
    """Read a supported text file as UTF-8.

    Args:
        file_path: Path to the file

    Returns:
        The file's text, or None when the extension is not a supported text
        type or reading fails (a warning is logged on failure)
    """
    extension = file_path.suffix.lower().lstrip(".")
    if extension not in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT:
        return None

    try:
        text = file_path.read_text(encoding="utf-8")
    except Exception as e:
        logger.warning(f"Failed to extract text from {file_path}: {e}")
        return None
    return text
240
+
241
+
242
def _setup_upload_progress(file_path: Path, file_size: int, progress_bar: tqdm) -> None:
    """Log the start of an upload and label the progress bar with the file name.

    Args:
        file_path: File about to be uploaded
        file_size: Size of the file in bytes (used for logging only)
        progress_bar: Progress bar whose description is set and redrawn
    """
    start_message = (
        f"Starting resumable upload for file: {file_path} (size: {file_size:,} bytes)"
    )
    logger.debug(start_message)
    progress_bar.set_description(f"Uploading {file_path.name}")
    progress_bar.refresh()
249
+
250
+
251
async def _initiate_resumable_session(
    session: aiohttp.ClientSession, signed_url: str
) -> str:
    """POST to the signed URL to open a resumable upload and return its session URI.

    Args:
        session: aiohttp session used for the request
        signed_url: Signed URL that accepts the session-initiation POST

    Returns:
        The value of the response's ``location`` header

    Raises:
        aiohttp.ClientResponseError: via ``raise_for_status`` on an error status
        DataStorageError: if the response carries no ``location`` header
    """
    logger.debug("Initiating resumable upload session")
    async with session.post(signed_url, headers=INITIATE_HEADERS) as response:
        status = response.status
        if status != 200 and status != 201:
            body = await response.text()
            logger.error(f"Failed to initiate resumable session: {status}")
            logger.error(f"Response: {body}")
            response.raise_for_status()

        location = response.headers.get("location")
        return _validate_session_uri(location)
266
+
267
+
268
+ # TODO: temp
269
def _log_upload_debug(signed_url: str) -> None:
    """Emit a debug log line with the first 100 characters of the signed URL."""
    url_prefix = signed_url[:100]
    logger.debug(f"Signed URL: {url_prefix}...")
272
+
273
+
274
+ # TODO: temp
275
def _validate_session_uri(session_uri: str | None) -> str:
    """Return the session URI unchanged, rejecting a missing or empty value.

    Args:
        session_uri: ``location`` header value from the initiation response

    Returns:
        The same session URI

    Raises:
        DataStorageError: if ``session_uri`` is None or empty
    """
    if session_uri:
        uri_prefix = session_uri[:100]
        logger.debug(f"Resumable session initiated. Session URI: {uri_prefix}...")
        return session_uri

    raise DataStorageError("No session URI returned from resumable upload initiation")
283
+
284
+
285
async def _upload_chunk_with_retry(
    session: aiohttp.ClientSession,
    session_uri: str,
    chunk_data: bytes,
    range_start: int,
    file_size: int,
    progress_bar: tqdm,
) -> int:
    """Upload a single chunk with retry logic.

    The chunk is PUT to ``session_uri`` with a ``Content-Range`` header. Up to
    ``MAX_RETRIES`` attempts are made, sleeping ``2**attempt`` seconds between
    attempts after both transport errors and unsuccessful HTTP statuses.

    Args:
        session: aiohttp session used for the PUT
        session_uri: Resumable session URI returned by the initiation request
        chunk_data: Bytes of this chunk
        range_start: Offset of the first byte of this chunk within the file
        file_size: Total size of the file being uploaded
        progress_bar: Progress bar advanced by the chunk length on success

    Returns:
        Number of bytes uploaded (always ``len(chunk_data)``)

    Raises:
        aiohttp.ClientError: if the final attempt fails with a transport error
            or an HTTP error status
        TimeoutError: if the final attempt times out
        DataStorageError: if the final attempt returns a status that is neither
            a success nor an HTTP error (previously this silently returned 0,
            which let the caller continue with misaligned offsets)
    """
    chunk_length = len(chunk_data)
    range_end = range_start + chunk_length - 1
    chunk_headers = {
        "Content-Type": "application/octet-stream",
        "Content-Length": str(chunk_length),
        "Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
    }

    for attempt in range(MAX_RETRIES):
        is_last_attempt = attempt == MAX_RETRIES - 1
        try:
            async with session.put(
                session_uri, data=chunk_data, headers=chunk_headers
            ) as chunk_response:
                # NOTE(review): on a 308 the server's Range header may report
                # fewer persisted bytes than were sent; this code assumes the
                # whole chunk was stored -- confirm against the GCS docs.
                if chunk_response.status == HTTP_RESUME_INCOMPLETE:
                    progress_bar.update(chunk_length)
                    logger.debug(f"Uploaded chunk: {range_end + 1}/{file_size} bytes")
                    return chunk_length
                if chunk_response.status in {200, 201}:
                    progress_bar.update(chunk_length)
                    logger.debug(
                        f"Upload completed successfully. Final response: {chunk_response.status}"
                    )
                    return chunk_length

                error_text = await chunk_response.text()
                logger.warning(
                    f"Chunk upload failed (attempt {attempt + 1}/{MAX_RETRIES}): {chunk_response.status}"
                )
                logger.warning(f"Response: {error_text}")
                if is_last_attempt:
                    chunk_response.raise_for_status()

        # asyncio.TimeoutError only became an alias of TimeoutError in 3.11.
        except (TimeoutError, asyncio.TimeoutError, aiohttp.ClientError) as e:
            logger.warning(
                f"Chunk upload error (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
            )
            if is_last_attempt:
                raise

        if is_last_attempt:
            break
        # Back off after every failed attempt, not only after exceptions.
        await asyncio.sleep(2**attempt)

    raise DataStorageError(
        f"Chunk upload for bytes {range_start}-{range_end} did not succeed "
        f"after {MAX_RETRIES} attempts"
    )
334
+
335
+
336
async def _aupload_file_with_progress(
    signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
) -> None:
    """Upload a file asynchronously using aiohttp with signed URL initiation.

    A resumable session is opened with the signed URL, then the file is read
    and sent in chunks of at most ``CHUNK_SIZE`` bytes.

    Args:
        signed_url: Signed URL used to initiate the resumable session
        file_path: Local file to upload
        progress_bar: Progress bar updated as chunks are sent
        file_size: Expected size of the file in bytes

    Raises:
        DataStorageError: if a chunk reports zero uploaded bytes, or the file
            ends before ``file_size`` bytes were sent
        Exception: any error from the session or chunk uploads is logged and
            re-raised unchanged
    """
    _setup_upload_progress(file_path, file_size, progress_bar)
    _log_upload_debug(signed_url)

    try:
        # Total timeout scales with the file size, with a 600s floor.
        timeout = aiohttp.ClientTimeout(
            total=max(600.0, file_size / (512 * 1024)), connect=30, sock_read=30
        )
        connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)

        async with aiohttp.ClientSession(
            connector=connector, timeout=timeout
        ) as session:
            session_uri = await _initiate_resumable_session(session, signed_url)

            async with aiofiles.open(file_path, "rb") as file_obj:
                bytes_uploaded = 0

                while bytes_uploaded < file_size:
                    remaining = file_size - bytes_uploaded
                    chunk_data = await file_obj.read(min(CHUNK_SIZE, remaining))

                    if not chunk_data:
                        break

                    uploaded_bytes = await _upload_chunk_with_retry(
                        session,
                        session_uri,
                        chunk_data,
                        bytes_uploaded,
                        file_size,
                        progress_bar,
                    )
                    if uploaded_bytes <= 0:
                        # The file cursor has already advanced past this chunk;
                        # continuing would send later bytes at the wrong offset.
                        raise DataStorageError(
                            f"Chunk at offset {bytes_uploaded} was not uploaded"
                        )
                    bytes_uploaded += uploaded_bytes

        if bytes_uploaded < file_size:
            # The file was shorter than announced, so the upload is incomplete.
            raise DataStorageError(
                f"Upload incomplete: sent {bytes_uploaded:,} of {file_size:,} bytes"
            )

        logger.debug("Upload completed successfully")

    except Exception as e:
        logger.error(f"Async resumable upload error: {type(e).__name__}: {e}")
        raise
383
+
384
+
385
def _upload_file_with_progress(
    signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
) -> None:
    """Upload a file synchronously using google.resumable_media with signed URL initiation.

    A resumable session is initiated with a POST to the signed URL; the
    returned session URI is then injected into a ``ResumableUpload`` object,
    which transmits the file chunk by chunk.

    Args:
        signed_url: Signed URL used to initiate the resumable session
        file_path: Local file to upload
        progress_bar: Progress bar updated as the file is read
        file_size: Size of the file in bytes

    Raises:
        requests.HTTPError: via ``raise_for_status`` when initiation fails
        DataStorageError: if no session URI is returned
        Exception: any chunk transmission error is logged and re-raised
    """
    _setup_upload_progress(file_path, file_size, progress_bar)
    _log_upload_debug(signed_url)

    try:
        # The context manager closes the session's connection pool on exit,
        # including when an error is raised.
        with requests_lib.Session() as session:
            retry_strategy = Retry(
                total=MAX_RETRIES,
                backoff_factor=2,
                status_forcelist=[429, 500, 502, 503, 504],
                allowed_methods=["POST", "PUT", "PATCH"],
            )
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            logger.debug("Initiating resumable upload session")
            initiate_response = session.post(
                signed_url, headers=INITIATE_HEADERS, timeout=30
            )

            if initiate_response.status_code not in {200, 201}:
                logger.error(
                    f"Failed to initiate resumable session: {initiate_response.status_code}"
                )
                logger.error(f"Response: {initiate_response.text}")
                initiate_response.raise_for_status()

            session_uri = _validate_session_uri(
                initiate_response.headers.get("location")
            )

            with open(file_path, "rb") as file_obj:
                upload = resumable_requests.ResumableUpload(
                    upload_url=signed_url, chunk_size=CHUNK_SIZE
                )

                # NOTE(review): the session was initiated manually above, so the
                # upload object's private state is populated by hand; this
                # depends on google-resumable-media internals -- confirm when
                # upgrading that library.
                upload._resumable_url = session_uri
                upload._stream = ProgressWrapper(file_obj, progress_bar)
                upload._total_bytes = file_size

                while not upload.finished:
                    try:
                        upload.transmit_next_chunk(session)
                    except Exception as e:
                        logger.error(f"Chunk upload failed: {e}")
                        raise

            logger.debug("Upload completed successfully using resumable_media library")

    except Exception as e:
        logger.error(f"Sync resumable upload error: {type(e).__name__}: {e}")
        raise
442
+
443
+
444
class RestClientError(Exception):
    """Root of the exception hierarchy raised by the REST client."""


class DataStorageError(RestClientError):
    """Root exception for failures in data storage operations."""


class DataStorageCreationError(DataStorageError):
    """Signals that a data storage entry could not be created."""


class DataStorageRetrievalError(DataStorageError):
    """Signals that a data storage entry could not be retrieved."""
458
+
459
+
460
class ProgressWrapper:
    """File-like adapter that advances a progress bar as the file is read.

    Progress is derived from the wrapped file's current position, so the bar
    only moves forward: re-reading after a backwards ``seek`` does not count
    the same bytes twice.
    """

    def __init__(self, file_obj, progress_bar):
        # Underlying binary file object that all calls are delegated to.
        self.file_obj = file_obj
        # Object exposing ``n`` (current count) and ``update(delta)``.
        self.progress_bar = progress_bar
        # Running total of bytes returned by ``read`` (re-reads included).
        self.bytes_read = 0

    def read(self, size=-1):
        """Read from the wrapped file and push the bar up to the new position."""
        data = self.file_obj.read(size)
        if not data:
            return data

        self.bytes_read += len(data)
        position = self.file_obj.tell()
        if self.progress_bar.n < position:
            self.progress_bar.update(position - self.progress_bar.n)
        return data

    def seek(self, offset, whence=0):
        """Delegate to the wrapped file's ``seek``."""
        return self.file_obj.seek(offset, whence)

    def tell(self):
        """Delegate to the wrapped file's ``tell``."""
        return self.file_obj.tell()
482
+
483
+
484
+ class DataStorageMethods:
485
+ """Data storage methods for RestClient.
486
+
487
+ This class contains methods for interacting with the data storage API endpoints.
488
+ """
489
+
490
+ # needed for mypy `NoReturn`
491
def _handle_http_errors(self, e: HTTPStatusError, operation: str) -> NoReturn:
    """Translate an HTTP status error into a DataStorageError; never returns.

    Args:
        e: The HTTP status error raised by the client
        operation: Verb inserted into the error message after "Error "

    Raises:
        DataStorageError: always, chained from ``e``. The message depends on
            whether the status is 403, 422 or anything else.
    """
    status = e.response.status_code
    if status == codes.FORBIDDEN:
        message = f"Error {operation} data storage entry, not authorized"
    elif status == codes.UNPROCESSABLE_ENTITY:
        message = f"Invalid request payload: {e.response.text}"
    else:
        message = (
            f"Error {operation} data storage entry: {e.response.status_code} - {e.response.text}"
        )
    raise DataStorageError(message) from e
502
+
503
+ def _validate_file_path(self, file_path: str | Path) -> Path:
504
+ """Validate file path exists and return Path object."""
505
+ file_path = Path(file_path)
506
+ if not file_path.exists():
507
+ raise DataStorageError(f"File or directory not found: {file_path}")
508
+ return file_path
509
+
510
+ def _build_zip_path(
511
+ self, name: str, path_override: str | Path | None
512
+ ) -> str | Path:
513
+ """Build GCS path for zip file."""
514
+ zip_filename = name if name.endswith(".zip") else f"{name}.zip"
515
+ if path_override:
516
+ if isinstance(path_override, str):
517
+ return f"{path_override.rstrip('/')}/{zip_filename}"
518
+ return path_override / zip_filename
519
+ return zip_filename
520
+
521
+ # TODO: methods in here need to be moved to fh tools
522
+ # =====================================
523
+ def _is_zip_file(self, file_path: Path) -> bool:
524
+ """Check if a file is a zip file by checking its magic bytes."""
525
+ try:
526
+ with open(file_path, "rb") as f:
527
+ magic = f.read(2)
528
+ return magic == b"PK"
529
+ except Exception:
530
+ return False
531
+
532
+ def _extract_zip_file(self, zip_path: Path, extract_to: Path) -> Path:
533
+ """Extract a zip file and return the path to the extracted content.
534
+
535
+ Args:
536
+ zip_path: Path to the zip file
537
+ extract_to: Directory to extract to
538
+
539
+ Returns:
540
+ Path to the extracted content (directory or single file)
541
+ """
542
+ extract_dir = extract_to
543
+ extract_dir.mkdir(exist_ok=True)
544
+
545
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
546
+ zip_ref.extractall(extract_dir)
547
+ extracted_items = list(extract_dir.iterdir())
548
+
549
+ # Delete the zip file
550
+ zip_path.unlink()
551
+
552
+ if len(extracted_items) == 1:
553
+ return extracted_items[0]
554
+ return extract_dir
555
+
556
async def _adownload_from_gcs(
    self, signed_url: str, file_name: str | None = None
) -> Path:
    """Download file from GCS using signed URL and handle unzipping if needed.

    The payload is streamed into a temporary directory. The result (the file
    itself, or the extracted content when it is a zip archive) is then copied
    into a fresh ``tempfile.mkdtemp()`` directory, which is not removed by
    this method.

    Args:
        signed_url: The signed URL to download from
        file_name: The name of the file to download; overridden by a
            ``filename=`` value in the Content-Disposition header if present

    Returns:
        Path to the downloaded file (or unzipped directory if it was a zip)

    Raises:
        DataStorageError: wrapping any failure during download or extraction
    """
    file_name = file_name or "downloaded_file"

    try:
        with tempfile.TemporaryDirectory() as temp_dir_str:
            temp_dir = Path(temp_dir_str)

            async with self.async_client.stream("GET", signed_url) as response:
                response.raise_for_status()

                content_disposition = response.headers.get(
                    "content-disposition", ""
                )
                filename = file_name
                if "filename=" in content_disposition:
                    filename = content_disposition.split("filename=")[-1].strip('"')

                # The name comes from the caller or a response header: keep
                # only the last path component so it cannot point outside
                # temp_dir (e.g. "../../x" or an absolute path).
                safe_name = Path(filename).name
                if safe_name in {"", ".", ".."}:
                    safe_name = "downloaded_file"
                temp_file = temp_dir / safe_name

                async with aiofiles.open(temp_file, "wb") as f:
                    async for chunk in response.aiter_bytes(chunk_size=8192):
                        await f.write(chunk)

            logger.debug(
                f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
            )

            source_path = temp_file
            if self._is_zip_file(temp_file):
                logger.debug(f"File {temp_file} is a zip file, extracting...")
                source_path = self._extract_zip_file(temp_file, temp_dir)

            # temp_dir is deleted when the context exits, so copy the result
            # into a directory that outlives this call.
            final_path = Path(tempfile.mkdtemp()) / source_path.name
            if source_path.is_dir():
                shutil.copytree(source_path, final_path)
            else:
                shutil.copy2(source_path, final_path)
            return final_path

    except Exception as e:
        raise DataStorageError(f"Failed to download from GCS: {e}") from e
616
+
617
def _download_from_gcs(self, signed_url: str, file_name: str | None = None) -> Path:
    """Download file from GCS using signed URL and handle unzipping if needed (sync version).

    The payload is streamed into a temporary directory. The result (the file
    itself, or the extracted content when it is a zip archive) is then copied
    into a fresh ``tempfile.mkdtemp()`` directory, which is not removed by
    this method.

    Args:
        signed_url: The signed URL to download from
        file_name: The name of the file to download; overridden by a
            ``filename=`` value in the Content-Disposition header if present

    Returns:
        Path to the downloaded file (or unzipped directory if it was a zip)

    Raises:
        DataStorageError: wrapping any failure during download or extraction
    """
    file_name = file_name or "downloaded_file"

    try:
        with tempfile.TemporaryDirectory() as temp_dir_str:
            temp_dir = Path(temp_dir_str)

            with requests_lib.get(signed_url, stream=True, timeout=30) as response:
                response.raise_for_status()

                content_disposition = response.headers.get(
                    "content-disposition", ""
                )
                filename = file_name
                if "filename=" in content_disposition:
                    filename = content_disposition.split("filename=")[-1].strip('"')

                # The name comes from the caller or a response header: keep
                # only the last path component so it cannot point outside
                # temp_dir (e.g. "../../x" or an absolute path).
                safe_name = Path(filename).name
                if safe_name in {"", ".", ".."}:
                    safe_name = "downloaded_file"
                temp_file = temp_dir / safe_name

                with open(temp_file, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            logger.debug(
                f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
            )

            source_path = temp_file
            if self._is_zip_file(temp_file):
                logger.debug(f"File {temp_file} is a zip file, extracting...")
                source_path = self._extract_zip_file(temp_file, temp_dir)

            # temp_dir is deleted when the context exits, so copy the result
            # into a directory that outlives this call.
            final_path = Path(tempfile.mkdtemp()) / source_path.name
            if source_path.is_dir():
                shutil.copytree(source_path, final_path)
            else:
                shutil.copy2(source_path, final_path)
            return final_path

    except Exception as e:
        raise DataStorageError(f"Failed to download from GCS: {e}") from e
674
+
675
def _prepare_single_file_upload(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | Path | None,
    dataset_id: UUID | None,
    project_id: UUID | None,
    metadata: dict[str, Any] | None,
    tags: list[str] | None,
    parent_id: UUID | None,
) -> tuple[int, DataStorageRequestPayload | None]:
    """Measure a file and, when it qualifies, build an inline-text payload.

    Returns:
        ``(file_size, payload)``. ``payload`` is a request carrying the
        file's text when the file is a small supported text file whose
        content could be read; otherwise it is None and the caller is
        expected to upload the file itself.
    """
    file_size = file_path.stat().st_size
    logger.debug(
        f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
    )

    if not _should_send_as_text_content(file_path, file_size):
        return file_size, None

    logger.debug(f"Small text file ({file_size:,} bytes) - sending as text content")
    text_content = _extract_text_from_file(file_path)
    if text_content is None:
        logger.warning("Could not extract text content, falling back to file upload")
        return file_size, None

    text_payload = DataStorageRequestPayload(
        name=name,
        description=description,
        content=text_content,
        file_path=file_path_override or file_path,
        is_collection=False,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        dataset_id=dataset_id,
        parent_id=parent_id,
    )
    return file_size, text_payload
716
+
717
def _create_data_storage_entry(
    self, payload: DataStorageRequestPayload
) -> DataStorageResponse:
    """POST the payload to the data-entries endpoint and parse the reply (sync).

    The payload is serialised in JSON mode with None fields omitted. A
    non-success status raises via ``raise_for_status``.
    """
    request_body = payload.model_dump(mode="json", exclude_none=True)
    response = self.client.post("/v0.1/data-storage/data-entries", json=request_body)
    response.raise_for_status()
    response_body = response.json()
    return DataStorageResponse.model_validate(response_body)
727
+
728
async def _acreate_data_storage_entry(
    self, payload: DataStorageRequestPayload
) -> DataStorageResponse:
    """POST the payload to the data-entries endpoint and parse the reply (async).

    The payload is serialised in JSON mode with None fields omitted. A
    non-success status raises via ``raise_for_status``.
    """
    request_body = payload.model_dump(mode="json", exclude_none=True)
    response = await self.async_client.post(
        "/v0.1/data-storage/data-entries", json=request_body
    )
    response.raise_for_status()
    response_body = response.json()
    return DataStorageResponse.model_validate(response_body)
738
+
739
+ def _generate_folder_description_from_files(
740
+ self, dir_path: Path, manifest: DirectoryManifest
741
+ ) -> str:
742
+ """Generate folder description by concatenating descriptions of top-level files."""
743
+ descriptions = []
744
+
745
+ # Get top-level files only (not recursive)
746
+ for item in dir_path.iterdir():
747
+ if item.is_file():
748
+ # Try to get description from manifest first
749
+ file_desc = manifest.get_entry_description(item.name)
750
+
751
+ if file_desc:
752
+ descriptions.append(f"{item.name}: {file_desc}")
753
+ else:
754
+ descriptions.append(item.name)
755
+
756
+ if descriptions:
757
+ return f"Directory containing: {', '.join(descriptions)}"
758
+ return f"Directory: {dir_path.name}"
759
+
760
def _load_manifest(
    self, dir_path: Path, manifest_filename: str | None
) -> DirectoryManifest:
    """Parse a JSON or YAML manifest located in ``dir_path`` into a DirectoryManifest.

    Args:
        dir_path: Directory expected to contain the manifest
        manifest_filename: Manifest file name, or None/empty for no manifest

    Returns:
        The parsed manifest. An empty manifest is returned when no file name
        is given, when the extension is unsupported, or when reading/parsing
        fails (a warning is logged in the last two cases).

    Raises:
        DataStorageCreationError: if a file name is given but the file is missing
    """
    if not manifest_filename:
        return DirectoryManifest()

    manifest_path = dir_path / manifest_filename
    if not manifest_path.exists():
        logger.error(f"Manifest file not found at {manifest_path}")
        raise DataStorageCreationError(
            f"Manifest file {manifest_filename} not found in directory {dir_path}. Ensure the manifest exists and is correctly named, or do not pass it as an argument."
        )

    try:
        with open(manifest_path, encoding="utf-8") as f:
            lowered_name = manifest_filename.lower()
            if lowered_name.endswith(".json"):
                data = json.load(f)
            elif lowered_name.endswith((".yaml", ".yml")):
                if yaml is None:
                    # Note: this is caught by the handler below, so a missing
                    # pyyaml surfaces as a warning plus an empty manifest.
                    raise ImportError(
                        "pyyaml is required to parse .yaml manifest files. "
                        "Please install it with `pip install pyyaml`."
                    )
                data = yaml.safe_load(f)
            else:
                logger.warning(
                    f"Unsupported manifest file extension: {manifest_filename}"
                )
                return DirectoryManifest()

        # An empty document parses to None/empty; normalise to a dict.
        return DirectoryManifest.from_dict(data or {})

    except Exception as e:
        logger.warning(f"Failed to load manifest {manifest_filename}: {e}")
        return DirectoryManifest()
798
+
799
def _upload_data_directory(
    self,
    name: str,
    dir_path: Path,
    description: str | None,
    dir_path_override: str | Path | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Upload a directory as a single zip file collection.

    The directory is zipped into a temporary location, a data storage entry
    is created, the zip is uploaded to every returned storage location, and
    the entry is finally patched to status "active".

    Args:
        name: Name for the directory collection
        dir_path: Path to directory to zip and upload
        description: Description for the collection
        dir_path_override: Optional GCS path for the zip file
        ignore_patterns: List of patterns to ignore when zipping
        ignore_filename: Name of ignore file to read from directory
        project_id: ID of the project this data storage entry belongs to
        tags: List of tags to associate with the data storage entry
        metadata: Optional metadata for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse for the uploaded zip file

    Raises:
        DataStorageCreationError: if a storage location has no signed URL
    """
    logger.debug(f"Uploading directory as zip: {dir_path}")

    # Use a temporary directory rather than NamedTemporaryFile: the zip is
    # written and re-read by path, and a file that is still held open by
    # NamedTemporaryFile cannot be reopened by name on Windows.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_zip_path = Path(temp_dir) / f"{dir_path.name or 'upload'}.zip"

        zip_size = _create_directory_zip(
            dir_path, temp_zip_path, ignore_patterns, ignore_filename
        )

        zip_gcs_path = self._build_zip_path(name, dir_path_override)
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            file_path=zip_gcs_path,
            is_collection=True,
            project_id=project_id,
            tags=tags,
            metadata=metadata,
            dataset_id=dataset_id,
            parent_id=parent_id,
        )

        logger.debug(
            f"Creating data storage entry for zip: {payload.model_dump(exclude_none=True)}"
        )
        data_storage_response = self._create_data_storage_entry(payload)

        for storage_location in data_storage_response.storage_locations:
            if not storage_location.storage_config.signed_url:
                raise DataStorageCreationError(
                    "No signed URL returned for zip upload"
                )

            with tqdm(
                total=zip_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=f"Uploading {dir_path.name} (zipped)",
                miniters=1,
                mininterval=0.1,
            ) as pbar:
                _upload_file_with_progress(
                    storage_location.storage_config.signed_url,
                    temp_zip_path,
                    pbar,
                    zip_size,
                )

        # Mark the entry active only after all uploads have finished.
        status_response = self.client.patch(
            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
            json={"status": "active"},
        )
        status_response.raise_for_status()

        logger.debug(
            f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
        )
        return DataStorageResponse.model_validate(status_response.json())
890
+
891
async def _aupload_data_directory(
    self,
    name: str,
    dir_path: Path,
    description: str | None,
    dir_path_override: str | Path | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously upload a directory as a single zip file.

    The directory is zipped into a temporary location, a data storage entry
    is created, the zip is uploaded to every returned storage location, and
    the entry is finally patched to status "active".

    Args:
        name: Name for the directory collection
        dir_path: Path to directory to zip and upload
        description: Description for the collection
        dir_path_override: Optional GCS path for the zip file
        ignore_patterns: List of patterns to ignore when zipping
        ignore_filename: Name of ignore file to read from directory
        project_id: ID of the project this data storage entry belongs to
        tags: List of tags to associate with the data storage entry
        metadata: Optional metadata for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse for the uploaded zip file

    Raises:
        DataStorageCreationError: if a storage location has no signed URL
    """
    logger.debug(f"Async uploading directory as zip: {dir_path}")

    # Use a temporary directory rather than NamedTemporaryFile: the zip is
    # written and re-read by path, and a file that is still held open by
    # NamedTemporaryFile cannot be reopened by name on Windows.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_zip_path = Path(temp_dir) / f"{dir_path.name or 'upload'}.zip"

        zip_size = _create_directory_zip(
            dir_path, temp_zip_path, ignore_patterns, ignore_filename
        )

        zip_gcs_path = self._build_zip_path(name, dir_path_override)
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            file_path=zip_gcs_path,
            is_collection=True,
            project_id=project_id,
            tags=tags,
            metadata=metadata,
            dataset_id=dataset_id,
            parent_id=parent_id,
        )

        data_storage_response = await self._acreate_data_storage_entry(payload)

        for storage_location in data_storage_response.storage_locations:
            if not storage_location.storage_config.signed_url:
                raise DataStorageCreationError(
                    "No signed URL returned for zip upload"
                )

            with tqdm(
                total=zip_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=f"Uploading {dir_path.name} (zipped)",
                miniters=1,
                mininterval=0.1,
            ) as pbar:
                await _aupload_file_with_progress(
                    storage_location.storage_config.signed_url,
                    temp_zip_path,
                    pbar,
                    zip_size,
                )

        # Mark the entry active only after all uploads have finished.
        status_response = await self.async_client.patch(
            f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
            json={"status": "active"},
        )
        status_response.raise_for_status()

        logger.debug(
            f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
        )
        return DataStorageResponse.model_validate(status_response.json())
979
+
980
def _upload_data_single_file(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | Path | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Upload one file, inline as text when possible, otherwise via signed URL.

    Args:
        name: Name of the data storage entry
        file_path: Local file to upload
        description: Optional description of the entry
        file_path_override: Path to record for the entry instead of ``file_path``
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry
        dataset_id: Dataset to add the entry to, or None to create a new dataset
        parent_id: Optional parent ID for the entry

    Returns:
        The response of the entry creation (text path) or the response
        validated from the status update (signed-URL path).

    Raises:
        DataStorageCreationError: If a storage location carries no signed URL
    """
    file_size = file_path.stat().st_size
    logger.debug(
        f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
    )

    # Fields common to the inline-text request and the signed-URL request.
    shared_fields: dict[str, Any] = {
        "name": name,
        "description": description,
        "file_path": file_path_override or file_path,
        "is_collection": False,
        "project_id": project_id,
        "metadata": metadata,
        "tags": tags,
        "dataset_id": dataset_id,
        "parent_id": parent_id,
    }

    if _should_send_as_text_content(file_path, file_size):
        logger.debug(
            f"Small text file ({file_size:,} bytes) - sending as text content"
        )
        inline_text = _extract_text_from_file(file_path)
        if inline_text is None:
            logger.warning(
                "Could not extract text content, falling back to file upload"
            )
        else:
            text_request = DataStorageRequestPayload(
                content=inline_text, **shared_fields
            )
            logger.debug("Sending file as text content")
            return self._create_data_storage_entry(text_request)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    upload_request = DataStorageRequestPayload(**shared_fields)
    logger.debug(
        f"Requesting signed URL with payload: {upload_request.model_dump(exclude_none=True)}"
    )
    created = self._create_data_storage_entry(upload_request)

    for location in created.storage_locations:
        upload_url = location.storage_config.signed_url
        if not upload_url:
            raise DataStorageCreationError("No signed URL returned from server")

        progress = tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Uploading {file_path.name}",
            miniters=1,
            mininterval=0.1,
        )
        with progress as pbar:
            try:
                _upload_file_with_progress(upload_url, file_path, pbar, file_size)
                logger.debug("File upload to signed URL completed successfully")
            except Exception as e:
                # Log for context, then let the caller see the original error.
                logger.error(f"Failed to upload file to signed URL: {e}")
                raise

    logger.debug("Updating data storage status to active")
    entry_url = f"/v0.1/data-storage/data-entries/{created.data_storage.id}"
    status_response = self.client.patch(entry_url, json={"status": "active"})
    status_response.raise_for_status()
    logger.debug("Data storage status updated successfully")

    return DataStorageResponse.model_validate(status_response.json())
1079
+
1080
async def _aupload_data_single_file(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | Path | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Upload one file asynchronously, as inline text or through a signed URL.

    ``_prepare_single_file_upload`` supplies the file size and, when the file
    is to be sent as text, a ready payload. Otherwise a signed-URL entry is
    created, the file is sent to each returned storage location, and the entry
    is patched to ``active``.

    Raises:
        DataStorageCreationError: If a storage location carries no signed URL
    """
    file_size, text_payload = self._prepare_single_file_upload(
        name=name,
        file_path=file_path,
        description=description,
        file_path_override=file_path_override,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        parent_id=parent_id,
    )

    if text_payload:
        logger.debug("Sending file as text content")
        text_payload.dataset_id = dataset_id
        return await self._acreate_data_storage_entry(text_payload)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    created = await self._acreate_data_storage_entry(
        DataStorageRequestPayload(
            name=name,
            description=description,
            file_path=file_path_override or file_path,
            is_collection=False,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
            parent_id=parent_id,
        )
    )

    for target in created.storage_locations:
        upload_url = target.storage_config.signed_url
        if not upload_url:
            raise DataStorageCreationError(
                f"No signed URL returned from server for location: {target.id}"
            )

        progress = tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Uploading {file_path.name}",
            miniters=1,
            mininterval=0.1,
            leave=False,
        )
        with progress as pbar:
            await _aupload_file_with_progress(
                upload_url, file_path, pbar, file_size
            )

    entry_url = f"/v0.1/data-storage/data-entries/{created.data_storage.id}"
    status_response = await self.async_client.patch(
        entry_url, json={"status": "active"}
    )
    status_response.raise_for_status()

    return DataStorageResponse.model_validate(status_response.json())
1154
+
1155
def _upload_data_single_file_with_parent(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | None,
    parent_id: UUID | None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse:
    """Upload one file beneath a parent entry (sync version).

    When ``_prepare_single_file_upload`` returns a text payload, the payload's
    parent, dataset and project IDs are set from the arguments and it is sent
    directly. Otherwise the file is uploaded to each returned storage location
    through its signed URL and the entry is patched to ``active``.

    Raises:
        DataStorageCreationError: If a storage location carries no signed URL
    """
    file_size, text_payload = self._prepare_single_file_upload(
        name=name,
        file_path=file_path,
        description=description,
        file_path_override=file_path_override,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        parent_id=parent_id,
    )

    if text_payload:
        logger.debug("Sending file as text content with parent_id")
        text_payload.parent_id = parent_id
        text_payload.dataset_id = dataset_id
        text_payload.project_id = project_id
        return self._create_data_storage_entry(text_payload)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    created = self._create_data_storage_entry(
        DataStorageRequestPayload(
            name=name,
            description=description,
            file_path=file_path_override or file_path,
            is_collection=False,
            parent_id=parent_id,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
        )
    )

    for target in created.storage_locations:
        upload_url = target.storage_config.signed_url
        if not upload_url:
            raise DataStorageCreationError("No signed URL returned from server")

        progress = tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Uploading {file_path.name}",
            miniters=1,
            mininterval=0.1,
            leave=False,
        )
        with progress as pbar:
            _upload_file_with_progress(upload_url, file_path, pbar, file_size)

    entry_url = f"/v0.1/data-storage/data-entries/{created.data_storage.id}"
    status_response = self.client.patch(entry_url, json={"status": "active"})
    status_response.raise_for_status()

    return DataStorageResponse.model_validate(status_response.json())
1228
+
1229
def _process_file_item(
    self,
    item: Path,
    dir_manifest: DirectoryManifest,
    current_parent_id: UUID,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse | None:
    """Upload one file found during a directory walk (best effort).

    The description comes from the manifest entry for the file name, falling
    back to ``"File: <name>"``. Any exception raised during the upload is
    logged and swallowed so the surrounding walk can continue.

    Returns:
        The upload response, or None when the upload failed.
    """
    try:
        file_description = (
            dir_manifest.get_entry_description(item.name) or f"File: {item.name}"
        )
        logger.debug(
            f"Processing file {item.name} with description: '{file_description}'"
        )
        uploaded = self._upload_data_single_file_with_parent(
            name=item.name,
            file_path=item,
            description=file_description,
            file_path_override=None,
            parent_id=current_parent_id,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
        )
    except Exception as e:
        # Deliberately broad: one failing file must not abort the whole walk.
        logger.error(f"Failed to upload file {item}: {e}")
        return None
    return uploaded
1262
+
1263
def _upload_directory_hierarchically(
    self,
    name: str,
    dir_path: Path,
    description: str | None = None,
    manifest_filename: str | None = None,
    parent_id: UUID | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    base_dir: Path | None = None,
    dir_manifest: DirectoryManifest | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> list[DataStorageResponse]:
    """Mirror a directory tree as nested data storage entries (sync).

    Without a ``parent_id`` this call is the root of the walk: it creates the
    root entry (always with ``dataset_id=None``; the dataset ID of the created
    entry is used for everything below it), collects the ignore patterns and
    loads the manifest. With a ``parent_id`` the supplied patterns, manifest
    and dataset ID are used as given. Each sub-directory gets its own entry
    and is walked recursively; each file is uploaded beneath the current entry.

    Returns:
        All created responses: directory entries plus the files that uploaded
        successfully (failed file uploads are omitted).
    """
    collected: list[DataStorageResponse] = []

    if parent_id is not None:
        # A parent was supplied (e.g. by the recursive call below).
        effective_patterns = ignore_patterns or []
        owner_id = parent_id
        owner_dataset_id = dataset_id
    else:
        base_dir = dir_path
        effective_patterns = _collect_ignore_patterns(
            base_dir, ignore_patterns, ignore_filename
        )

        root_entry = self._create_data_storage_entry(
            DataStorageRequestPayload(
                name=name,
                description=description,
                parent_id=None,
                dataset_id=None,
                is_collection=False,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
        )
        collected.append(root_entry)
        owner_id = root_entry.data_storage.id
        owner_dataset_id = root_entry.data_storage.dataset_id

        dir_manifest = self._load_directory_manifest(
            manifest_filename, parent_id, dir_path
        )

    for child in dir_path.iterdir():
        if base_dir and _should_ignore_file(child, base_dir, effective_patterns):
            continue

        if not child.is_dir():
            if child.is_file():
                uploaded = self._process_file_item(
                    item=child,
                    dir_manifest=dir_manifest or DirectoryManifest(),
                    current_parent_id=owner_id,
                    dataset_id=owner_dataset_id,
                    project_id=project_id,
                    metadata=metadata,
                    tags=tags,
                )
                if uploaded:
                    collected.append(uploaded)
            continue

        # Pick the manifest that applies to this sub-directory, if any.
        nested_manifest = DirectoryManifest()
        if dir_manifest:
            manifest_entry = dir_manifest.entries.get(child.name)
            if isinstance(manifest_entry, DirectoryManifest):
                nested_manifest = manifest_entry
            elif isinstance(manifest_entry, ManifestEntry):
                # Convert single entry to manifest
                nested_manifest = DirectoryManifest(
                    entries={child.name: manifest_entry}
                )

        nested_description = nested_manifest.get_entry_description(
            child.name
        ) or self._generate_folder_description_from_files(child, nested_manifest)

        folder_entry = self._create_data_storage_entry(
            DataStorageRequestPayload(
                name=child.name,
                description=nested_description,
                parent_id=owner_id,
                dataset_id=owner_dataset_id,
                is_collection=False,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
        )
        collected.append(folder_entry)

        collected.extend(
            self._upload_directory_hierarchically(
                name=child.name,
                dir_path=child,
                description=None,
                manifest_filename=None,
                parent_id=folder_entry.data_storage.id,
                ignore_patterns=effective_patterns,
                ignore_filename=ignore_filename,
                base_dir=base_dir,
                dir_manifest=nested_manifest,
                dataset_id=owner_dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
        )

    return collected
1374
+
1375
def _load_directory_manifest(
    self,
    manifest_filename: str | None,
    parent_id: UUID | None,
    dir_path: Path,
) -> DirectoryManifest:
    """Return the manifest section that applies to ``dir_path``.

    A manifest is only loaded when a filename is given and no parent ID is
    set. It is read relative to the current working directory and looked up
    by the directory's name. An empty manifest is returned in every other case.
    """
    if not manifest_filename or parent_id:
        return DirectoryManifest()

    manifest_data = self._load_manifest(Path.cwd(), manifest_filename)
    dir_name = dir_path.name
    logger.debug(f"Loaded manifest entries: {list(manifest_data.entries.keys())}")
    logger.debug(f"Looking for manifest entry with directory name: '{dir_name}'")

    matched = manifest_data.entries.get(dir_name)
    if isinstance(matched, DirectoryManifest):
        return matched
    if isinstance(matched, ManifestEntry):
        # Wrap a lone entry so callers always receive a DirectoryManifest.
        return DirectoryManifest(entries={dir_name: matched})

    logger.debug(
        f"No manifest entry found for '{dir_name}', available keys: {list(manifest_data.entries.keys())}"
    )
    return DirectoryManifest()
1402
+
1403
async def _aupload_data_single_file_with_parent(
    self,
    name: str,
    file_path: Path,
    description: str | None,
    file_path_override: str | None,
    parent_id: UUID | None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse:
    """Asynchronously upload a single file with a parent ID.

    When ``_prepare_single_file_upload`` returns a text payload, the payload's
    parent, dataset and project IDs are set from the arguments and it is sent
    directly. Otherwise a signed-URL entry is created, the file is uploaded to
    every storage location the server returned (matching the other single-file
    uploaders in this class), and the entry is patched to ``active``.

    Args:
        name: Name of the data storage entry
        file_path: Local file to upload
        description: Optional description of the entry
        file_path_override: Path to record for the entry instead of ``file_path``
        parent_id: ID of the parent entry
        dataset_id: Dataset to add the entry to
        project_id: ID of the project this entry belongs to
        metadata: Optional metadata for the entry
        tags: Optional tags for the entry

    Returns:
        The response of the entry creation (text path) or the response
        validated from the status update (signed-URL path).

    Raises:
        DataStorageCreationError: If the server returned no storage location,
            or a storage location carries no signed URL
    """
    file_size, text_payload = self._prepare_single_file_upload(
        name=name,
        file_path=file_path,
        description=description,
        file_path_override=file_path_override,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
        parent_id=parent_id,
    )

    if text_payload:
        logger.debug("Sending file as text content with parent_id")
        text_payload.parent_id = parent_id
        text_payload.dataset_id = dataset_id
        text_payload.project_id = project_id
        return await self._acreate_data_storage_entry(text_payload)

    logger.debug(
        f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
    )
    payload = DataStorageRequestPayload(
        name=name,
        description=description,
        file_path=file_path_override or file_path,
        is_collection=False,
        parent_id=parent_id,
        dataset_id=dataset_id,
        project_id=project_id,
        metadata=metadata,
        tags=tags,
    )
    data_storage_response = await self._acreate_data_storage_entry(payload)

    # Previously an empty list surfaced as a bare IndexError from `[0]`;
    # report it as the domain error instead of marking an empty entry active.
    if not data_storage_response.storage_locations:
        raise DataStorageCreationError("No storage location returned from server")

    # Upload to every returned location, not just the first one.
    for storage_location in data_storage_response.storage_locations:
        if not storage_location.storage_config.signed_url:
            raise DataStorageCreationError("No signed URL returned from server")

        with tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Uploading {file_path.name}",
            miniters=1,
            mininterval=0.1,
        ) as pbar:
            await _aupload_file_with_progress(
                storage_location.storage_config.signed_url,
                file_path,
                pbar,
                file_size,
            )

    status_response = await self.async_client.patch(
        f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
        json={"status": "active"},
    )
    status_response.raise_for_status()

    return DataStorageResponse.model_validate(status_response.json())
1476
+
1477
async def _aprocess_file_item(
    self,
    item: Path,
    dir_manifest: DirectoryManifest,
    current_parent_id: UUID,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> DataStorageResponse | None:
    """Asynchronously process a single file item for upload (best effort).

    The description comes from the manifest entry for the file name, falling
    back to ``"File: <name>"``. Any exception raised during the upload is
    logged and swallowed so the surrounding directory walk can continue.

    Args:
        item: File to upload
        dir_manifest: Manifest consulted for the file's description
        current_parent_id: ID of the entry the file is attached to
        dataset_id: Dataset the file entry belongs to
        project_id: ID of the project the file entry belongs to
        metadata: Optional metadata for the file entry (new, defaults to None
            so existing callers keep working)
        tags: Optional tags for the file entry (new, defaults to None)

    Returns:
        The upload response, or None when the upload failed.
    """
    try:
        manifest_desc = dir_manifest.get_entry_description(item.name)
        file_description = manifest_desc or f"File: {item.name}"

        logger.debug(
            f"Processing file {item.name} with description: '{file_description}'"
        )

        return await self._aupload_data_single_file_with_parent(
            name=item.name,
            file_path=item,
            description=file_description,
            file_path_override=None,
            parent_id=current_parent_id,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
        )
    except Exception as e:
        # Deliberately broad: one failing file must not abort the whole walk.
        logger.error(f"Failed to upload file {item}: {e}")
        return None


async def _aupload_directory_hierarchically(
    self,
    name: str,
    dir_path: Path,
    description: str | None = None,
    manifest_filename: str | None = None,
    parent_id: UUID | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    base_dir: Path | None = None,
    dir_manifest: DirectoryManifest | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
) -> list[DataStorageResponse]:
    """Upload a directory with single dataset and individual file storage entries (async).

    Without a ``parent_id`` this call is the root of the walk: it creates the
    root entry (always with ``dataset_id=None``; the dataset ID of the created
    entry is used for everything below it), collects the ignore patterns and
    loads the manifest. With a ``parent_id`` the supplied patterns, manifest
    and dataset ID are used as given. Each sub-directory gets its own entry
    and is walked recursively; each file is uploaded beneath the current entry
    with the same ``project_id``, ``metadata`` and ``tags`` as the directory
    entries, matching the synchronous implementation.

    Returns:
        All created responses: directory entries plus the files that uploaded
        successfully (failed file uploads are omitted).
    """
    responses: list[DataStorageResponse] = []

    if parent_id is None:
        base_dir = dir_path
        all_ignore_patterns = _collect_ignore_patterns(
            base_dir, ignore_patterns, ignore_filename
        )

        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            parent_id=None,
            dataset_id=None,
            is_collection=False,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
        )

        dir_response = await self._acreate_data_storage_entry(payload)
        responses.append(dir_response)
        current_parent_id = dir_response.data_storage.id
        current_dataset_id = dir_response.data_storage.dataset_id

        dir_manifest = self._load_directory_manifest(
            manifest_filename, parent_id, dir_path
        )
    else:
        all_ignore_patterns = ignore_patterns or []
        current_parent_id = parent_id
        current_dataset_id = dataset_id

    for item in dir_path.iterdir():
        if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
            continue

        if item.is_dir():
            subdir_manifest = DirectoryManifest()
            if dir_manifest:
                entry = dir_manifest.entries.get(item.name)
                if isinstance(entry, DirectoryManifest):
                    subdir_manifest = entry
                elif isinstance(entry, ManifestEntry):
                    # Convert single entry to manifest
                    subdir_manifest = DirectoryManifest(entries={item.name: entry})

            subdir_description = subdir_manifest.get_entry_description(item.name)
            if not subdir_description:
                subdir_description = self._generate_folder_description_from_files(
                    item, subdir_manifest
                )

            subdir_payload = DataStorageRequestPayload(
                name=item.name,
                description=subdir_description,
                parent_id=current_parent_id,
                dataset_id=current_dataset_id,
                is_collection=False,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
            subdir_response = await self._acreate_data_storage_entry(subdir_payload)
            responses.append(subdir_response)

            subdir_responses = await self._aupload_directory_hierarchically(
                name=item.name,
                dir_path=item,
                description=None,
                manifest_filename=None,
                parent_id=subdir_response.data_storage.id,
                ignore_patterns=all_ignore_patterns,
                ignore_filename=ignore_filename,
                base_dir=base_dir,
                dir_manifest=subdir_manifest,
                dataset_id=current_dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
            responses.extend(subdir_responses)
        elif item.is_file():
            # Forward project_id, metadata and tags so file entries carry the
            # same attributes as their directories; the earlier positional
            # call silently dropped all three.
            file_response = await self._aprocess_file_item(
                item=item,
                dir_manifest=dir_manifest or DirectoryManifest(),
                current_parent_id=current_parent_id,
                dataset_id=current_dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
            )
            if file_response:
                responses.append(file_response)

    return responses
1615
+
1616
@property
def client(self) -> Client:
    """Synchronous HTTP client; subclasses must override this property."""
    message = "client property must be implemented by subclass"
    raise NotImplementedError(message)
1619
+
1620
@property
def async_client(self) -> AsyncClient:
    """Asynchronous HTTP client; subclasses must override this property."""
    message = "async_client property must be implemented by subclass"
    raise NotImplementedError(message)
1625
+
1626
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def store_text_content(
    self,
    name: str,
    content: str,
    description: str | None = None,
    file_path: str | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    dataset_id: UUID | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Store a string as a data storage entry.

    Args:
        name: Name of the data storage entry
        content: Text to store
        description: Optional description of the data storage entry
        file_path: Optional path for the data storage entry
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        dataset_id: Dataset to add the entry to, or None to create a new dataset
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model with two parts:
            - data_storage: the created DataStorageEntry (id, name,
              description, content, embedding, is_collection, tags,
              parent_id, project_id, dataset_id, file_path, bigquery_schema,
              user_id, created_at, modified_at)
            - storage_locations: one item per location, each with id,
              data_storage_id and a storage_config model (storage_type such
              as 'gcs' or 'pg_table', content_type, content_schema, metadata,
              location, signed_url)

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry
    """
    try:
        # Built inside the try so payload construction errors are wrapped too.
        request = DataStorageRequestPayload(
            name=name,
            content=content,
            description=description,
            file_path=file_path,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
            dataset_id=dataset_id,
            parent_id=parent_id,
        )
        return self._create_data_storage_entry(request)
    except HTTPStatusError as http_error:
        self._handle_http_errors(http_error, "creating")
    except Exception as e:
        raise DataStorageCreationError(f"An unexpected error occurred: {e!r}") from e
1708
+
1709
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def astore_text_content(
    self,
    name: str,
    content: str,
    description: str | None = None,
    file_path: str | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously store a string as a data storage entry.

    Args:
        name: Name of the data storage entry
        content: Text to store
        description: Optional description of the data storage entry
        file_path: Optional path for the data storage entry
        dataset_id: Dataset to add the entry to, or None to create a new dataset
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model with two parts:
            - data_storage: the created DataStorageEntry (id, name,
              description, content, embedding, is_collection, tags,
              parent_id, project_id, dataset_id, file_path, bigquery_schema,
              user_id, created_at, modified_at)
            - storage_locations: one item per location, each with id,
              data_storage_id and a storage_config model (storage_type such
              as 'gcs' or 'pg_table', content_type, content_schema, metadata,
              location, signed_url)

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry
    """
    try:
        # Built inside the try so payload construction errors are wrapped too.
        request = DataStorageRequestPayload(
            name=name,
            content=content,
            description=description,
            file_path=file_path,
            dataset_id=dataset_id,
            project_id=project_id,
            metadata=metadata,
            tags=tags,
            parent_id=parent_id,
        )
        return await self._acreate_data_storage_entry(request)
    except HTTPStatusError as http_error:
        self._handle_http_errors(http_error, "creating")
    except Exception as e:
        raise DataStorageCreationError(f"An unexpected error occurred: {e!r}") from e
1791
+
1792
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def astore_link(
    self,
    name: str,
    url: HttpUrl,
    description: str,
    instructions: str,
    api_key: str | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously register a link/URL as a data storage entry.

    The URL is stored both as the entry content and as the location of a
    LINK-type storage location. That location's metadata is a copy of
    ``metadata`` extended with the instructions and, when given, the API key;
    the entry itself receives the caller's ``metadata`` unchanged.

    Args:
        name: Name of the link entry
        url: The URL/link to store
        description: Searchable details of the link
        instructions: Instructions for how to consume the link or api
        api_key: Any authentication key to access the api. If this is included,
            the instructions should also explain how the key is to be used.
        metadata: Any additional metadata about the link
        dataset_id: Dataset to add the entry to, or None to create a new dataset
        project_id: ID of the project this data storage entry belongs to
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse containing the created link storage entry

    Raises:
        DataStorageCreationError: If there's an error creating the link storage entry
    """
    try:
        # Work on a copy so the caller's metadata dict is never mutated.
        location_metadata = metadata.copy() if metadata else {}
        location_metadata["instructions"] = instructions
        if api_key:
            location_metadata["api_key"] = api_key

        link_location = DataStorageLocationPayload(
            storage_type=DataStorageType.LINK,
            content_type=DataContentType.TEXT,
            location=str(url),
            metadata=location_metadata or None,
        )
        request = DataStorageRequestPayload(
            name=name,
            content=str(url),
            description=description,
            dataset_id=dataset_id,
            project_id=project_id,
            existing_location=link_location,
            tags=tags,
            metadata=metadata,
            parent_id=parent_id,
        )
        return await self._acreate_data_storage_entry(request)
    except HTTPStatusError as http_error:
        self._handle_http_errors(http_error, "creating")
    except Exception as e:
        raise DataStorageCreationError(f"An unexpected error occurred: {e!r}") from e
1863
+
1864
def store_link(
    self,
    name: str,
    url: HttpUrl,
    description: str,
    instructions: str,
    api_key: str | None = None,
    metadata: dict[str, Any] | None = None,
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Register a link/URL as an entry in the data storage system (sync).

    The stringified URL is used both as the entry content and as the location
    of a LINK-type storage location. ``instructions`` (and ``api_key`` when
    given) are attached to the *location* metadata only; the entry itself is
    created with the caller's ``metadata`` unchanged.

    Args:
        name: Name of the link entry
        url: The URL/link to store
        description: Searchable details of the link
        instructions: Instructions for how to consume the link or api
        api_key: Any authentication key to access the api. If this is included,
            you should also include details of how the key should be consumed
            in the instructions.
        metadata: Any additional metadata about the link
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        project_id: ID of the project this data storage entry belongs to
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse containing the created link storage entry

    Raises:
        DataStorageCreationError: If there's an error creating the link storage
            entry. HTTP status errors are handed to ``_handle_http_errors``.
    """
    try:
        # Shallow copy so the keys added below never leak into the caller's dict.
        location_metadata: dict[str, Any]
        if metadata:
            location_metadata = metadata.copy()
        else:
            location_metadata = {}
        location_metadata["instructions"] = instructions
        if api_key:
            location_metadata["api_key"] = api_key

        link_target = str(url)
        link_location = DataStorageLocationPayload(
            storage_type=DataStorageType.LINK,
            content_type=DataContentType.TEXT,
            location=link_target,
            metadata=location_metadata or None,
        )
        request_payload = DataStorageRequestPayload(
            name=name,
            content=link_target,
            description=description,
            dataset_id=dataset_id,
            project_id=project_id,
            existing_location=link_location,
            tags=tags,
            # The entry keeps the caller's metadata; instructions/api_key live
            # on the location payload built above.
            metadata=metadata,
            parent_id=parent_id,
        )
        return self._create_data_storage_entry(request_payload)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "creating")
    except Exception as exc:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {exc!r}"
        ) from exc
1929
+
1930
# NOTE(review): the broad ``except Exception`` in the body converts every
# non-HTTP failure into DataStorageCreationError before tenacity sees it, so
# ``retry_if_connection_error`` may never match - confirm against its definition.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def store_file_content(
    self,
    name: str,
    file_path: str | Path,
    description: str | None = None,
    file_path_override: str | Path | None = None,
    as_collection: bool = False,
    manifest_filename: str | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    project_id: UUID | None = None,
    dataset_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Store file or directory content in the data storage system.

    For files: Small text files (< 10MB, supported formats) are sent as text content,
    larger/binary files are uploaded via signed URL.

    For directories: with ``as_collection=True`` the directory is zipped as a
    single file (with ignore pattern support) and uploaded as a collection;
    otherwise it is uploaded hierarchically and the first response of that
    upload is returned.

    Args:
        name: Name of the data storage entry
        file_path: Path to file or directory to upload
        description: Optional description of the data storage entry
        file_path_override: Optional path for the data storage entry
        as_collection: If true, upload directories as a single zip file collection.
        manifest_filename: Name of manifest file (JSON or YAML) containing:
            - entries - Map of file/directory names to their manifest entries
            - Each ManifestEntry contains:
                - description - Description of the file or directory
                - metadata - Additional metadata for the entry
            - Each DirectoryManifest contains nested entries following the same structure
            Only forwarded for hierarchical (non-collection) directory uploads.
        ignore_patterns: List of patterns to ignore when zipping directories
        ignore_filename: Name of ignore file to read from directory (default: .gitignore)
        project_id: ID of the project this data storage entry belongs to
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error in the process, including
            a hierarchical upload that produced no entries.
    """
    file_path = self._validate_file_path(file_path)

    try:
        if file_path.is_dir():
            if as_collection:
                return self._upload_data_directory(
                    name=name,
                    dir_path=file_path,
                    description=description,
                    dir_path_override=file_path_override,
                    ignore_patterns=ignore_patterns,
                    ignore_filename=ignore_filename,
                    project_id=project_id,
                    dataset_id=dataset_id,
                    parent_id=parent_id,
                    metadata=metadata,
                    tags=tags,
                )
            responses = self._upload_directory_hierarchically(
                name=name,
                dir_path=file_path,
                description=description,
                manifest_filename=manifest_filename,
                ignore_patterns=ignore_patterns,
                ignore_filename=ignore_filename,
                project_id=project_id,
                dataset_id=dataset_id,
                parent_id=parent_id,
                metadata=metadata,
                tags=tags,
            )
            if not responses:
                raise DataStorageCreationError(
                    "No data storage entries were created"
                )
            return responses[0]
        # NOTE(review): dataset_id, metadata, tags and parent_id are not
        # forwarded on the single-file path - confirm whether
        # _upload_data_single_file should receive them.
        return self._upload_data_single_file(
            name, file_path, description, file_path_override, project_id
        )

    except DataStorageCreationError:
        # Already a domain error with a precise message; do not re-wrap it as
        # an "unexpected error" in the generic handler below.
        raise
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during file upload: {e!r}"
        ) from e
2059
+
2060
# NOTE(review): the broad ``except Exception`` in the body converts every
# non-HTTP failure into DataStorageCreationError before tenacity sees it, so
# ``retry_if_connection_error`` may never match - confirm against its definition.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def astore_file_content(
    self,
    name: str,
    file_path: str | Path,
    description: str | None = None,
    file_path_override: str | Path | None = None,
    as_collection: bool = False,
    manifest_filename: str | None = None,
    ignore_patterns: list[str] | None = None,
    ignore_filename: str = ".gitignore",
    dataset_id: UUID | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously store file or directory content in the data storage system.

    Args:
        name: Name of the data storage entry.
        file_path: Path to the file or directory to upload.
        description: Optional description for the entry.
        file_path_override: Optional GCS path for the entry.
        as_collection: If uploading a directory, `True` zips it into a single collection,
            `False` uploads it as a hierarchical structure of individual objects.
        manifest_filename: Optional manifest file (JSON or YAML) for hierarchical uploads containing:
            - entries - Map of file/directory names to their manifest entries
            - Each ManifestEntry contains:
                - description - Description of the file or directory
                - metadata - Additional metadata for the entry
            - Each DirectoryManifest contains nested entries following the same structure
        ignore_patterns: List of patterns to ignore when zipping.
        ignore_filename: Name of ignore file to read (default: .gitignore).
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset.
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

        For hierarchical uploads, this is the response for the root directory entry.

    Raises:
        DataStorageCreationError: If there's an error in the process, including
            a hierarchical upload that produced no entries.
    """
    file_path = self._validate_file_path(file_path)

    try:
        if file_path.is_dir():
            if as_collection:
                return await self._aupload_data_directory(
                    name=name,
                    dir_path=file_path,
                    description=description,
                    dir_path_override=file_path_override,
                    ignore_patterns=ignore_patterns,
                    ignore_filename=ignore_filename,
                    project_id=project_id,
                    metadata=metadata,
                    tags=tags,
                    dataset_id=dataset_id,
                    parent_id=parent_id,
                )
            responses = await self._aupload_directory_hierarchically(
                name=name,
                dir_path=file_path,
                description=description,
                manifest_filename=manifest_filename,
                ignore_patterns=ignore_patterns,
                ignore_filename=ignore_filename,
                dataset_id=dataset_id,
                project_id=project_id,
                metadata=metadata,
                tags=tags,
                parent_id=parent_id,
            )
            if not responses:
                raise DataStorageCreationError(
                    "No data storage entries were created"
                )
            return responses[0]
        # NOTE(review): metadata, tags and parent_id are not forwarded on the
        # single-file path - confirm whether _aupload_data_single_file should
        # receive them.
        return await self._aupload_data_single_file(
            name, file_path, description, file_path_override, dataset_id, project_id
        )

    except DataStorageCreationError:
        # Already a domain error with a precise message; do not re-wrap it as
        # an "unexpected error" in the generic handler below.
        raise
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during async file upload: {e!r}"
        ) from e
2182
+
2183
# NOTE(review): the broad ``except Exception`` in the body converts every
# non-HTTP failure into DataStorageCreationError before tenacity sees it, so
# ``retry_if_connection_error`` may never match - confirm against its definition.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def register_existing_data_source(
    self,
    name: str,
    existing_location: DataStorageLocationPayload,
    description: str | None = None,
    as_collection: bool = False,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
    dataset_id: UUID | None = None,
) -> DataStorageResponse:
    """Register an already-existing data source as a data storage entry.

    No content is uploaded: the entry is created by POSTing a payload that
    points at ``existing_location``.

    Note that ``description`` precedes ``as_collection`` here, while the async
    variant ``aregister_existing_data_source`` declares them in the opposite
    order; prefer keyword arguments for both.

    Args:
        name: Name of the data storage entry
        existing_location: a pydantic model describing the existing data source location to register, containing:
            - storage_type - Type of storage (BIGQUERY, GCS, PG_TABLE, RAW_CONTENT, ELASTIC_SEARCH)
            - content_type - Type of content (BQ_DATASET, BQ_TABLE, TEXT, TEXT_W_EMBEDDINGS, DIRECTORY, FILE, INDEX, INDEX_W_EMBEDDINGS)
            - content_schema - Content schema for the data
            - metadata - Additional metadata for the location
            - location - Location path or identifier
        description: Optional description of the data storage entry
        as_collection: If uploading a directory, `True` creates a single storage entry for
            the whole directory and multiple storage locations for each file, `False` assumes
            you are uploading a single file. Sent to the API as ``is_collection``.
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry
    """
    try:
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            existing_location=existing_location,
            project_id=project_id,
            is_collection=as_collection,
            metadata=metadata,
            tags=tags,
            parent_id=parent_id,
            dataset_id=dataset_id,
        )
        response = self.client.post(
            "/v0.1/data-storage/data-entries",
            # mode="json" renders UUIDs and other non-JSON-native values as
            # JSON-compatible types; python mode would leave UUID objects in
            # the dict, which a standard JSON encoder rejects.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return DataStorageResponse.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2278
+
2279
# NOTE(review): the broad ``except Exception`` in the body converts every
# non-HTTP failure into DataStorageCreationError before tenacity sees it, so
# ``retry_if_connection_error`` may never match - confirm against its definition.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aregister_existing_data_source(
    self,
    name: str,
    existing_location: DataStorageLocationPayload,
    as_collection: bool = False,
    description: str | None = None,
    project_id: UUID | None = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
    parent_id: UUID | None = None,
    dataset_id: UUID | None = None,
) -> DataStorageResponse:
    """Asynchronously register an already-existing data source as a data storage entry.

    No content is uploaded: the entry is created by POSTing a payload that
    points at ``existing_location``.

    Note that ``as_collection`` precedes ``description`` here, while the sync
    variant ``register_existing_data_source`` declares them in the opposite
    order; prefer keyword arguments for both.

    Args:
        name: Name of the data storage entry
        existing_location: a pydantic model describing the existing data source location to register, containing:
            - storage_type - Type of storage (BIGQUERY, GCS, PG_TABLE, RAW_CONTENT, ELASTIC_SEARCH)
            - content_type - Type of content (BQ_DATASET, BQ_TABLE, TEXT, TEXT_W_EMBEDDINGS, DIRECTORY, FILE, INDEX, INDEX_W_EMBEDDINGS)
            - content_schema - Content schema for the data
            - metadata - Additional metadata for the location
            - location - Location path or identifier
        as_collection: If uploading a directory, `True` creates a single storage entry for
            the whole directory and multiple storage locations for each file, `False` assumes
            you are uploading a single file. Sent to the API as ``is_collection``.
        description: Optional description of the data storage entry
        project_id: ID of the project this data storage entry belongs to
        metadata: Optional metadata for the data storage entry
        tags: Optional tags for the data storage entry
        parent_id: Optional parent ID for the data storage entry
        dataset_id: Optional dataset ID to add entry to, or None to create new dataset

    Returns:
        DataStorageResponse: A Pydantic model containing:
            - data_storage: DataStorageEntry with fields:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated
            - storage_locations with each location containing:
                - id - Unique identifier for the storage location
                - data_storage_id - ID of the associated data storage entry
                - storage_config pydantic model with fields:
                    - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
                    - content_type - Type of content stored
                    - content_schema - Content schema
                    - metadata - Location metadata
                    - location - Location path or identifier
                - signed_url - Signed URL for uploading/downloading

    Raises:
        DataStorageCreationError: If there's an error creating the data storage entry
    """
    try:
        payload = DataStorageRequestPayload(
            name=name,
            description=description,
            existing_location=existing_location,
            project_id=project_id,
            is_collection=as_collection,
            metadata=metadata,
            tags=tags,
            parent_id=parent_id,
            dataset_id=dataset_id,
        )
        response = await self.async_client.post(
            "/v0.1/data-storage/data-entries",
            # mode="json" renders UUIDs and other non-JSON-native values as
            # JSON-compatible types; python mode would leave UUID objects in
            # the dict, which a standard JSON encoder rejects.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return DataStorageResponse.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2374
+
2375
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def search_data_storage(
    self,
    criteria: list[SearchCriterion] | None = None,
    limit: int = 10,
    offset: int = 0,
    filter_logic: FilterLogic = FilterLogic.OR,
) -> list[dict]:
    """Run a structured-criteria search over data storage objects (sync).

    Args:
        criteria: List of SearchCriterion pydantic models with fields:
            - field - Field name to search on
            - operator - Search operator (EQUALS, CONTAINS, STARTS_WITH, ENDS_WITH, GREATER_THAN, LESS_THAN, BETWEEN, IN)
            - value - Value to search for
            ``None`` is sent as an empty list.
        limit: Number of results to return; values outside 1-100 are clamped
        offset: Number of results to skip
        filter_logic: Either "AND" (all criteria must match) or "OR" (at least one must match)

    Returns:
        List of search results with scores and data storage information
        (the decoded JSON body of the response).

    Raises:
        DataStorageCreationError: If there's an error searching data storage
            entries, including a 503 from the search endpoint.

    Example:
        from edison_client.models.rest import SearchCriterion, SearchOperator
        criteria = [
            SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
            SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
            SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
        ]
        results = client.search_data_storage(criteria=criteria, limit=20)
    """
    try:
        search_request = DataStorageSearchPayload(
            criteria=criteria or [],
            limit=max(1, min(100, limit)),  # Clamp between 1-100
            offset=offset,
            filter_logic=filter_logic,
        )
        request_body = search_request.model_dump(mode="json")

        resp = self.client.post("/v0.1/data-storage/search", json=request_body)
        resp.raise_for_status()
        return resp.json()

    except HTTPStatusError as http_err:
        # 503 gets a dedicated message; every other status goes through the
        # shared HTTP error handler.
        if http_err.response.status_code != codes.SERVICE_UNAVAILABLE:
            self._handle_http_errors(http_err, "searching")
        else:
            raise DataStorageCreationError(
                "Search functionality is currently unavailable"
            ) from http_err
    except Exception as exc:
        raise DataStorageCreationError(
            f"An unexpected error occurred during search: {exc!r}"
        ) from exc
2439
+
2440
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def asearch_data_storage(
    self,
    criteria: list[SearchCriterion] | None = None,
    limit: int = 10,
    offset: int = 0,
    filter_logic: FilterLogic = FilterLogic.OR,
) -> list[dict]:
    """Run a structured-criteria search over data storage objects (async).

    Args:
        criteria: List of SearchCriterion pydantic models with fields:
            - field - Field name to search on
            - operator - Search operator (EQUALS, CONTAINS, STARTS_WITH, ENDS_WITH, GREATER_THAN, LESS_THAN, BETWEEN, IN)
            - value - Value to search for
            ``None`` is sent as an empty list.
        limit: Number of results to return; values outside 1-100 are clamped
        offset: Number of results to skip
        filter_logic: Either "AND" (all criteria must match) or "OR" (at least one must match)

    Returns:
        List of search results with scores and data storage information
        (the decoded JSON body of the response).

    Raises:
        DataStorageCreationError: If there's an error searching data storage
            entries, including a 503 from the search endpoint.

    Example:
        from edison_client.models.rest import SearchCriterion, SearchOperator
        criteria = [
            SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
            SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
            SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
        ]
        results = await client.asearch_data_storage(criteria=criteria, limit=20)
    """
    try:
        search_request = DataStorageSearchPayload(
            criteria=criteria or [],
            limit=max(1, min(100, limit)),  # Clamp between 1-100
            offset=offset,
            filter_logic=filter_logic,
        )
        request_body = search_request.model_dump(mode="json")

        resp = await self.async_client.post(
            "/v0.1/data-storage/search", json=request_body
        )
        resp.raise_for_status()
        return resp.json()

    except HTTPStatusError as http_err:
        # 503 gets a dedicated message; every other status goes through the
        # shared HTTP error handler.
        if http_err.response.status_code != codes.SERVICE_UNAVAILABLE:
            self._handle_http_errors(http_err, "searching")
        else:
            raise DataStorageCreationError(
                "Search functionality is currently unavailable"
            ) from http_err
    except Exception as exc:
        raise DataStorageCreationError(
            f"An unexpected error occurred during async search: {exc!r}"
        ) from exc
2504
+
2505
# NOTE(review): the broad ``except Exception`` in the body converts every
# non-HTTP failure into DataStorageCreationError before tenacity sees it, so
# ``retry_if_connection_error`` may never match - confirm against its definition.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def similarity_search_data_storage(
    self,
    embedding: list[float],
    size: int = 10,
    min_score: float = 0.7,
    dataset_id: UUID | None = None,
    tags: list[str] | None = None,
    user_id: str | None = None,
    project_id: UUID | str | None = None,
) -> list[dict]:
    """Search data storage objects using vector similarity.

    Args:
        embedding: List of float values representing the embedding vector for similarity search
        size: Number of results to return; values outside 1-100 are clamped
        min_score: Minimum similarity score; values outside 0.0-1.0 are clamped
        dataset_id: Optional dataset ID filter
        tags: Optional list of string tags to filter by
        user_id: Optional user ID filter (admin only)
        project_id: Optional project ID filter (string or UUID; sent as a string)

    Returns:
        List of search results with similarity scores and data storage information
        (the decoded JSON body of the response).

    Raises:
        DataStorageCreationError: If the embedding is empty or contains
            non-numeric values, or if there's an error performing similarity search
    """
    try:
        # Validate inputs
        if not embedding:
            raise DataStorageCreationError("Embedding vector is required")

        if not all(isinstance(x, int | float) for x in embedding):
            raise DataStorageCreationError("Embedding must be a list of numbers")

        # Build request payload
        payload: dict[str, Any] = {
            "embedding": embedding,
            "size": max(1, min(100, size)),  # Clamp between 1-100
            "min_score": max(0.0, min(1.0, min_score)),  # Clamp between 0.0-1.0
        }

        # Add optional filters; IDs are stringified so UUID values stay
        # JSON-serializable.
        if dataset_id is not None:
            payload["dataset_id"] = str(dataset_id)
        if tags is not None:
            payload["tags"] = tags
        if user_id is not None:
            payload["user_id"] = user_id
        if project_id is not None:
            payload["project_id"] = str(project_id)

        response = self.client.post(
            "/v0.1/data-storage/similarity-search", json=payload
        )
        response.raise_for_status()
        return response.json()

    except DataStorageCreationError:
        # Validation errors above already carry a precise message; do not
        # re-wrap them as an "unexpected error" in the generic handler below.
        raise
    except HTTPStatusError as e:
        if e.response.status_code == codes.SERVICE_UNAVAILABLE:
            raise DataStorageCreationError(
                "Similarity search functionality is currently unavailable"
            ) from e
        self._handle_http_errors(e, "performing similarity search")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during similarity search: {e!r}"
        ) from e
2582
+
2583
# NOTE(review): the broad ``except Exception`` in the body converts every
# non-HTTP failure into DataStorageCreationError before tenacity sees it, so
# ``retry_if_connection_error`` may never match - confirm against its definition.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def asimilarity_search_data_storage(
    self,
    embedding: list[float],
    size: int = 10,
    min_score: float = 0.7,
    dataset_id: UUID | None = None,
    tags: list[str] | None = None,
    user_id: str | None = None,
    project_id: UUID | str | None = None,
) -> list[dict]:
    """Asynchronously search data storage objects using vector similarity.

    Args:
        embedding: List of float values representing the embedding vector for similarity search
        size: Number of results to return; values outside 1-100 are clamped
        min_score: Minimum similarity score; values outside 0.0-1.0 are clamped
        dataset_id: Optional dataset ID filter
        tags: Optional list of string tags to filter by
        user_id: Optional user ID filter (admin only)
        project_id: Optional project ID filter (string or UUID; sent as a string)

    Returns:
        List of search results with similarity scores and data storage information
        (the decoded JSON body of the response).

    Raises:
        DataStorageCreationError: If the embedding is empty or contains
            non-numeric values, or if there's an error performing similarity search
    """
    try:
        # Validate inputs
        if not embedding:
            raise DataStorageCreationError("Embedding vector is required")

        if not all(isinstance(x, int | float) for x in embedding):
            raise DataStorageCreationError("Embedding must be a list of numbers")

        # Build request payload
        payload: dict[str, Any] = {
            "embedding": embedding,
            "size": max(1, min(100, size)),  # Clamp between 1-100
            "min_score": max(0.0, min(1.0, min_score)),  # Clamp between 0.0-1.0
        }

        # Add optional filters; IDs are stringified so UUID values stay
        # JSON-serializable.
        if dataset_id is not None:
            payload["dataset_id"] = str(dataset_id)
        if tags is not None:
            payload["tags"] = tags
        if user_id is not None:
            payload["user_id"] = user_id
        if project_id is not None:
            payload["project_id"] = str(project_id)

        response = await self.async_client.post(
            "/v0.1/data-storage/similarity-search", json=payload
        )
        response.raise_for_status()
        return response.json()

    except DataStorageCreationError:
        # Validation errors above already carry a precise message; do not
        # re-wrap them as an "unexpected error" in the generic handler below.
        raise
    except HTTPStatusError as e:
        if e.response.status_code == codes.SERVICE_UNAVAILABLE:
            raise DataStorageCreationError(
                "Similarity search functionality is currently unavailable"
            ) from e
        self._handle_http_errors(e, "performing similarity search")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred during async similarity search: {e!r}"
        ) from e
2660
+
2661
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def fetch_data_from_storage(
    self,
    data_storage_id: UUID | None = None,
) -> RawFetchResponse | Path | list[Path] | None:
    """Fetch data from the storage system (sync version).

    Args:
        data_storage_id: UUID of the data storage entry to fetch. Required,
            even though the parameter defaults to None.

    Returns:
        For entries with more than one storage location: a list with one
            ``_download_from_gcs`` result per location
        For a single "gcs" location: the result of ``_download_from_gcs``
            (path to the downloaded file)
        For "raw_content" / "pg_table" storage: a RawFetchResponse holding the
            content, entry id and entry name (plus the filename when the entry
            has a file_path), or None when the entry has no content

    Raises:
        DataStorageRetrievalError: If data_storage_id is missing, the entry
            has no storage locations, a GCS entry has no signed URL, the
            storage type is unsupported, or an unexpected error occurs. HTTP
            error statuses are delegated to ``_handle_http_errors``.
    """
    if not data_storage_id:
        raise DataStorageRetrievalError(
            "data_storage_id must be provided at this time"
        )

    try:
        response = self.client.get(
            f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
        )
        response.raise_for_status()
        result = DataStorageResponse.model_validate(response.json())

        # Guard explicitly: indexing an empty list below would otherwise
        # surface as an opaque wrapped IndexError.
        if not result.storage_locations:
            raise DataStorageRetrievalError(
                f"No storage locations found for data storage entry {data_storage_id}"
            )

        if len(result.storage_locations) > 1:
            return [
                self._download_from_gcs(
                    location.storage_config.signed_url or "",
                    (
                        Path(location.storage_config.location).name
                        if location.storage_config.location
                        else None
                    ),
                )
                for location in result.storage_locations
            ]

        # Most scenarios will only have one location
        storage_location = result.storage_locations[0]
        storage_type = storage_location.storage_config.storage_type

        if storage_type == "gcs":
            if not storage_location.storage_config.signed_url:
                raise DataStorageRetrievalError(
                    "No signed URL available for GCS download"
                )

            return self._download_from_gcs(
                storage_location.storage_config.signed_url,
                (
                    Path(storage_location.storage_config.location).name
                    if storage_location.storage_config.location
                    else None
                ),
            )

        if storage_type in {"raw_content", "pg_table"}:
            content = result.data_storage.content
            if content is None:
                logger.warning(
                    f"No content found for data storage entry {data_storage_id}"
                )
                return None

            if result.data_storage.file_path:
                return RawFetchResponse(
                    filename=Path(result.data_storage.file_path),
                    content=content,
                    entry_id=result.data_storage.id,
                    entry_name=result.data_storage.name,
                )

            return RawFetchResponse(
                content=content,
                entry_id=result.data_storage.id,
                entry_name=result.data_storage.name,
            )

        raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")

    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except DataStorageRetrievalError:
        # Errors raised deliberately above are already descriptive; do not
        # re-wrap them as "unexpected" errors.
        raise
    except Exception as e:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2756
+
2757
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def afetch_data_from_storage(
    self,
    data_storage_id: UUID | None = None,
) -> RawFetchResponse | Path | list[Path] | None:
    """Fetch data from the storage system (async version).

    Args:
        data_storage_id: UUID of the data storage entry to fetch. Required,
            even though the parameter defaults to None.

    Returns:
        For entries with more than one storage location: the gathered
            ``_adownload_from_gcs`` results, one per location, fetched with at
            most DOWNLOAD_CONCURRENCY downloads in flight
        For a single "gcs" location: the result of ``_adownload_from_gcs``
            (path to the downloaded file)
        For "raw_content" / "pg_table" storage: a RawFetchResponse holding the
            content, entry id and entry name (plus the filename when the entry
            has a file_path), or None when the entry has no content

    Raises:
        DataStorageRetrievalError: If data_storage_id is missing, the entry
            has no storage locations, a GCS entry has no signed URL, the
            storage type is unsupported, or an unexpected error occurs. HTTP
            error statuses are delegated to ``_handle_http_errors``.
    """
    if not data_storage_id:
        raise DataStorageRetrievalError(
            "data_storage_id must be provided at this time"
        )

    try:
        response = await self.async_client.get(
            f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
        )
        response.raise_for_status()
        result = DataStorageResponse.model_validate(response.json())

        # Guard explicitly: indexing an empty list below would otherwise
        # surface as an opaque wrapped IndexError.
        if not result.storage_locations:
            raise DataStorageRetrievalError(
                f"No storage locations found for data storage entry {data_storage_id}"
            )

        if len(result.storage_locations) > 1:
            return await gather_with_concurrency(
                DOWNLOAD_CONCURRENCY,
                [
                    self._adownload_from_gcs(
                        location.storage_config.signed_url or "",
                        (
                            location.storage_config.location.split("/")[-1]
                            if location.storage_config.location
                            else None
                        ),
                    )
                    for location in result.storage_locations
                ],
            )

        # Most scenarios will only have one location
        storage_location = result.storage_locations[0]
        storage_type = storage_location.storage_config.storage_type

        if storage_type == "gcs":
            if not storage_location.storage_config.signed_url:
                raise DataStorageRetrievalError(
                    "No signed URL available for GCS download"
                )

            return await self._adownload_from_gcs(
                storage_location.storage_config.signed_url,
                (
                    storage_location.storage_config.location.split("/")[-1]
                    if storage_location.storage_config.location
                    else None
                ),
            )

        if storage_type in {"raw_content", "pg_table"}:
            content = result.data_storage.content
            if content is None:
                logger.warning(
                    f"No content found for data storage entry {data_storage_id}"
                )
                return None

            if result.data_storage.file_path:
                return RawFetchResponse(
                    filename=Path(result.data_storage.file_path),
                    content=content,
                    entry_id=result.data_storage.id,
                    entry_name=result.data_storage.name,
                )

            return RawFetchResponse(
                content=content,
                entry_id=result.data_storage.id,
                entry_name=result.data_storage.name,
            )

        raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")

    except HTTPStatusError as e:
        self._handle_http_errors(e, "retrieving")
    except DataStorageRetrievalError:
        # Errors raised deliberately above are already descriptive; do not
        # re-wrap them as "unexpected" errors.
        raise
    except Exception as e:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2855
+
2856
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def acreate_dataset(
    self,
    name: str,
    description: str | None = None,
    dataset_id: UUID | None = None,
) -> CreateDatasetPayload:
    """Asynchronously create a new dataset.

    Args:
        name: Name of the dataset to create
        description: Optional description of the dataset
        dataset_id: Optional UUID to assign to the dataset, or None to auto-generate

    Returns:
        CreateDatasetPayload: A Pydantic model validated from the response JSON, containing:
            - id - ID of the created dataset (None if auto-generated)
            - name - Name of the dataset
            - description - Description of the dataset

    Raises:
        DataStorageCreationError: If an unexpected error occurs while creating
            the dataset. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        payload = CreateDatasetPayload(
            name=name,
            description=description,
            id=dataset_id,
        )
        response = await self.async_client.post(
            "/v0.1/data-storage/datasets",
            # mode="json" renders non-JSON-native values (e.g. a UUID id) as
            # JSON-compatible types; the default python mode would hand a raw
            # UUID object to the JSON encoder, which cannot serialize it.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return CreateDatasetPayload.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2902
+
2903
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def create_dataset(
    self,
    name: str,
    description: str | None = None,
    dataset_id: UUID | None = None,
) -> CreateDatasetPayload:
    """Create a new dataset.

    Args:
        name: Name of the dataset to create
        description: Optional description of the dataset
        dataset_id: Optional UUID to assign to the dataset, or None to auto-generate

    Returns:
        CreateDatasetPayload: A Pydantic model validated from the response JSON, containing:
            - id - ID of the created dataset (None if auto-generated)
            - name - Name of the dataset
            - description - Description of the dataset

    Raises:
        DataStorageCreationError: If an unexpected error occurs while creating
            the dataset. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        payload = CreateDatasetPayload(
            name=name,
            description=description,
            id=dataset_id,
        )
        response = self.client.post(
            "/v0.1/data-storage/datasets",
            # mode="json" renders non-JSON-native values (e.g. a UUID id) as
            # JSON-compatible types; the default python mode would hand a raw
            # UUID object to the JSON encoder, which cannot serialize it.
            json=payload.model_dump(mode="json", exclude_none=True),
        )
        response.raise_for_status()
        return CreateDatasetPayload.model_validate(response.json())
    except HTTPStatusError as e:
        self._handle_http_errors(e, "creating")
    except Exception as e:
        raise DataStorageCreationError(
            f"An unexpected error occurred: {e!r}"
        ) from e
2949
+
2950
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def adelete_dataset(self, dataset_id: UUID) -> None:
    """Asynchronously delete a dataset.

    Note: This will delete all data storage entries associated with the dataset.

    Args:
        dataset_id: ID of the dataset to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            dataset. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        response = await self.async_client.delete(
            f"/v0.1/data-storage/datasets/{dataset_id}"
        )
        # httpx does not raise on 4xx/5xx responses by itself; without this
        # call the HTTPStatusError handler below could never fire and a
        # failed deletion would be reported as success.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2973
+
2974
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def delete_dataset(self, dataset_id: UUID) -> None:
    """Delete a dataset.

    Note: This will delete all data storage entries associated with the dataset.

    Args:
        dataset_id: ID of the dataset to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            dataset. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        response = self.client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
        # httpx does not raise on 4xx/5xx responses by itself; without this
        # call the HTTPStatusError handler below could never fire and a
        # failed deletion would be reported as success.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2997
+
2998
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aget_dataset(self, dataset_id: UUID) -> GetDatasetAndEntriesResponse:
    """Asynchronously fetch a dataset, together with its entries, by ID.

    Args:
        dataset_id: UUID of the dataset to retrieve

    Returns:
        GetDatasetAndEntriesResponse validated from the response JSON, holding:
            - dataset: DatasetStorage with fields:
                - id - Unique identifier for the dataset
                - name - Name of the dataset
                - user_id - ID of the user who created the dataset
                - description - Description of the dataset
                - created_at - Timestamp when the dataset was created
                - modified_at - Timestamp when the dataset was last modified
            - data_storage_entries - List of data storage entries in the dataset, each containing:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated

    Raises:
        DataStorageError: If an unexpected error occurs while retrieving the
            dataset. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        endpoint = f"/v0.1/data-storage/datasets/{dataset_id}"
        resp = await self.async_client.get(endpoint)
        resp.raise_for_status()
        dataset_json = resp.json()
        return GetDatasetAndEntriesResponse.model_validate(dataset_json)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "retrieving")
    except Exception as exc:
        raise DataStorageError(f"An unexpected error occurred: {exc!r}") from exc
3050
+
3051
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def get_dataset(self, dataset_id: UUID) -> GetDatasetAndEntriesResponse:
    """Fetch a dataset, together with its entries, by ID.

    Args:
        dataset_id: UUID of the dataset to retrieve

    Returns:
        GetDatasetAndEntriesResponse validated from the response JSON, holding:
            - dataset: DatasetStorage with fields:
                - id - Unique identifier for the dataset
                - name - Name of the dataset
                - user_id - ID of the user who created the dataset
                - description - Description of the dataset
                - created_at - Timestamp when the dataset was created
                - modified_at - Timestamp when the dataset was last modified
            - data_storage_entries - List of data storage entries in the dataset, each containing:
                - id - Unique identifier for the data storage entry
                - name - Name of the data storage entry
                - description - Description of the data storage entry
                - content - Content of the data storage entry
                - embedding - Embedding vector for the content
                - is_collection - Whether this entry is a collection
                - tags - List of tags associated with the entry
                - parent_id - ID of the parent entry for hierarchical storage
                - project_id - ID of the project this entry belongs to
                - dataset_id - ID of the dataset this entry belongs to
                - file_path - Filepath in the storage system where this entry is located
                - bigquery_schema - Target BigQuery schema for the entry
                - user_id - ID of the user who created this entry
                - created_at - Timestamp when the entry was created
                - modified_at - Timestamp when the entry was last updated

    Raises:
        DataStorageError: If an unexpected error occurs while retrieving the
            dataset. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        endpoint = f"/v0.1/data-storage/datasets/{dataset_id}"
        resp = self.client.get(endpoint)
        resp.raise_for_status()
        dataset_json = resp.json()
        return GetDatasetAndEntriesResponse.model_validate(dataset_json)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "retrieving")
    except Exception as exc:
        raise DataStorageError(f"An unexpected error occurred: {exc!r}") from exc
3101
+
3102
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def get_data_storage_entry(self, data_storage_id: UUID) -> DataStorageResponse:
    """Retrieve one data storage entry, including its storage locations and metadata.

    Args:
        data_storage_id: ID of the data storage entry to retrieve

    Returns:
        DataStorageResponse validated from the response JSON

    Raises:
        DataStorageRetrievalError: If an unexpected error occurs while
            retrieving the entry. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        endpoint = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        resp = self.client.get(endpoint, timeout=100)
        resp.raise_for_status()
        entry_json = resp.json()
        return DataStorageResponse.model_validate(entry_json)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "retrieving")
    except Exception as exc:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {exc!r}"
        ) from exc
3132
+
3133
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aget_data_storage_entry(
    self, data_storage_id: UUID
) -> DataStorageResponse:
    """Asynchronously retrieve one data storage entry, including its storage locations and metadata.

    Args:
        data_storage_id: ID of the data storage entry to retrieve

    Returns:
        DataStorageResponse validated from the response JSON

    Raises:
        DataStorageRetrievalError: If an unexpected error occurs while
            retrieving the entry. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        endpoint = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        resp = await self.async_client.get(endpoint, timeout=100)
        resp.raise_for_status()
        entry_json = resp.json()
        return DataStorageResponse.model_validate(entry_json)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "retrieving")
    except Exception as exc:
        raise DataStorageRetrievalError(
            f"An unexpected error occurred: {exc!r}"
        ) from exc
3165
+
3166
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def adelete_data_storage_entry(self, data_storage_entry_id: UUID) -> None:
    """Asynchronously delete a data storage entry.

    Args:
        data_storage_entry_id: UUID of the data storage entry to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            data storage entry. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        response = await self.async_client.delete(
            f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
        )
        # httpx does not raise on 4xx/5xx responses by itself; without this
        # call the HTTPStatusError handler below could never fire and a
        # failed deletion would be reported as success.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3189
+
3190
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def delete_data_storage_entry(self, data_storage_entry_id: UUID) -> None:
    """Delete a data storage entry.

    Args:
        data_storage_entry_id: UUID of the data storage entry to delete

    Raises:
        DataStorageError: If an unexpected error occurs while deleting the
            data storage entry. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        response = self.client.delete(
            f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
        )
        # httpx does not raise on 4xx/5xx responses by itself; without this
        # call the HTTPStatusError handler below could never fire and a
        # failed deletion would be reported as success.
        response.raise_for_status()
    except HTTPStatusError as e:
        self._handle_http_errors(e, "deleting")
    except Exception as e:
        raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
3213
+
3214
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def aupdate_entry_permissions(
    self,
    data_storage_id: UUID,
    share_status: ShareStatus,
    permitted_accessors: PermittedAccessors,
) -> DataStorageResponse:
    """Asynchronously change the sharing permissions of a data storage entry.

    Sends a PATCH carrying the new share status and the dumped
    permitted-accessors model to the entry's endpoint.

    Args:
        data_storage_id: UUID of the data storage entry to update
        share_status: Share status to set
        permitted_accessors: Permitted accessors to set

    Returns:
        DataStorageResponse validated from the response JSON

    Raises:
        DataStorageError: If an unexpected error occurs while updating the
            entry permissions. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        endpoint = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        patch_body = {
            "share_status": share_status,
            "permitted_accessors": permitted_accessors.model_dump(),
        }
        resp = await self.async_client.patch(endpoint, json=patch_body)
        resp.raise_for_status()
        updated_json = resp.json()
        return DataStorageResponse.model_validate(updated_json)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "updating")
    except Exception as exc:
        raise DataStorageError(f"An unexpected error occurred: {exc!r}") from exc
3253
+
3254
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, max=10),
    retry=retry_if_connection_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def update_entry_permissions(
    self,
    data_storage_id: UUID,
    share_status: ShareStatus,
    permitted_accessors: PermittedAccessors,
) -> DataStorageResponse:
    """Change the sharing permissions of a data storage entry.

    Sends a PATCH carrying the new share status and the dumped
    permitted-accessors model to the entry's endpoint.

    Args:
        data_storage_id: UUID of the data storage entry to update
        share_status: Share status to set
        permitted_accessors: Permitted accessors to set

    Returns:
        DataStorageResponse validated from the response JSON

    Raises:
        DataStorageError: If an unexpected error occurs while updating the
            entry permissions. HTTP error statuses are delegated to
            ``_handle_http_errors``.
    """
    try:
        endpoint = f"/v0.1/data-storage/data-entries/{data_storage_id}"
        patch_body = {
            "share_status": share_status,
            "permitted_accessors": permitted_accessors.model_dump(),
        }
        resp = self.client.patch(endpoint, json=patch_body)
        resp.raise_for_status()
        updated_json = resp.json()
        return DataStorageResponse.model_validate(updated_json)
    except HTTPStatusError as http_err:
        self._handle_http_errors(http_err, "updating")
    except Exception as exc:
        raise DataStorageError(f"An unexpected error occurred: {exc!r}") from exc