futurehouse-client 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2462 @@
1
+ import asyncio
2
+ import fnmatch
3
+ import json
4
+ import logging
5
+ import shutil
6
+ import tempfile
7
+ import zipfile
8
+ from os import PathLike
9
+ from pathlib import Path
10
+ from typing import NoReturn
11
+ from uuid import UUID
12
+
13
+ import aiofiles
14
+ import aiohttp
15
+ import requests as requests_lib
16
+ from google.resumable_media import requests as resumable_requests
17
+ from httpx import AsyncClient, Client, HTTPStatusError, codes
18
+ from lmi.utils import gather_with_concurrency
19
+ from requests.adapters import HTTPAdapter
20
+ from tenacity import (
21
+ before_sleep_log,
22
+ retry,
23
+ stop_after_attempt,
24
+ wait_exponential,
25
+ )
26
+ from tqdm import tqdm
27
+ from urllib3.util.retry import Retry
28
+
29
+ from futurehouse_client.models.data_storage_methods import (
30
+ CreateDatasetPayload,
31
+ DataStorageLocationPayload,
32
+ DataStorageRequestPayload,
33
+ DataStorageResponse,
34
+ DirectoryManifest,
35
+ ManifestEntry,
36
+ )
37
+ from futurehouse_client.models.rest import (
38
+ DataStorageSearchPayload,
39
+ SearchCriterion,
40
+ )
41
+ from futurehouse_client.utils.general import retry_if_connection_error
42
+
43
+ # pyyaml is only required when a YAML manifest is used
44
+ try:
45
+ import yaml
46
+ except ImportError:
47
+ yaml = None # type: ignore[assignment]
48
+
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
+ # TODO: pdf support, unsure what package we want to use
53
+ SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT = ["txt", "md", "csv", "json", "yaml", "yml"]
54
+ CHUNK_SIZE = 8 * 1024 * 1024 # 8MB
55
+ MAX_RETRIES = 3
56
+ SMALL_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024 # 10MB
57
+ HTTP_RESUME_INCOMPLETE = 308
58
+ INITIATE_HEADERS = {
59
+ "Content-Type": "application/octet-stream",
60
+ "x-goog-resumable": "start",
61
+ "Content-Length": "0",
62
+ }
63
+ DOWNLOAD_CONCURRENCY = 3
64
+
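+ # Note on the constants above: GCS resumable uploads are initiated with a POST
+ # carrying the "x-goog-resumable: start" header; the upload session URI comes back
+ # in the "Location" response header. Chunks are then PUT with a "Content-Range"
+ # header, and GCS answers 308 (resume incomplete) until the final chunk returns
+ # 200/201.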
65
+
66
+ def _should_ignore_file(
67
+ file_path: Path | PathLike,
68
+ base_path: Path | PathLike,
69
+ ignore_patterns: list[str] | None = None,
70
+ ) -> bool:
71
+ """Check if a file should be ignored based on ignore patterns.
72
+
73
+ Args:
74
+ file_path: Path to the file to check
75
+ base_path: Base directory path
76
+ ignore_patterns: List of ignore patterns (supports gitignore-style patterns)
77
+
78
+ Returns:
79
+ True if file should be ignored
80
+ """
81
+ if not ignore_patterns:
82
+ return False
83
+
84
+ try:
85
+ file_path = Path(file_path)
86
+ base_path = Path(base_path)
87
+ rel_path = file_path.relative_to(base_path)
88
+ rel_path_str = str(rel_path)
89
+
90
+ for pattern in ignore_patterns:
91
+ pattern = pattern.strip()
92
+ if not pattern or pattern.startswith("#"):
93
+ continue
94
+
95
+ is_absolute_match = pattern.startswith("/") and rel_path_str.startswith(
96
+ pattern[1:]
97
+ )
98
+ is_nested_match = "/" in pattern and pattern in rel_path_str
99
+ is_name_match = fnmatch.fnmatch(file_path.name, pattern)
100
+ is_part_match = pattern in rel_path.parts
101
+
102
+ if is_absolute_match or is_nested_match or is_name_match or is_part_match:
103
+ return True
104
+
105
+ except ValueError:
106
+ pass
107
+
108
+ return False
109
+
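+ # Illustrative examples of the matching rules above: with patterns
+ # ["*.pyc", "build/", "/docs"], "src/app.pyc" is ignored via the fnmatch name
+ # match, "build/lib/x.py" via the nested-path match, and "docs/readme.md" only
+ # via the leading-"/" anchored match.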
110
+
111
+ def _read_ignore_file(dir_path: Path, ignore_filename: str = ".gitignore") -> list[str]:
112
+ """Read ignore patterns from a file in the directory.
113
+
114
+ Args:
115
+ dir_path: Directory to look for ignore file
116
+ ignore_filename: Name of ignore file to read
117
+
118
+ Returns:
119
+ List of ignore patterns
120
+ """
121
+ ignore_file = dir_path / ignore_filename
122
+ if ignore_file.exists():
123
+ try:
124
+ with open(ignore_file, encoding="utf-8") as f:
125
+ return [line.strip() for line in f]
126
+ except Exception as e:
127
+ logger.warning(f"Failed to read {ignore_filename}: {e}")
128
+ return []
129
+ else:
130
+ return []
131
+
132
+
133
+ def _collect_ignore_patterns(
134
+ dir_path: Path,
135
+ ignore_patterns: list[str] | None = None,
136
+ ignore_filename: str = ".gitignore",
137
+ ) -> list[str]:
138
+ """Collect all ignore patterns from multiple sources.
139
+
140
+ Args:
141
+ dir_path: Directory to check for ignore files
142
+ ignore_patterns: Explicit ignore patterns
143
+ ignore_filename: Name of ignore file to read from directory
144
+
145
+ Returns:
146
+ Combined list of ignore patterns
147
+ """
148
+ # copy so the extends below do not mutate the caller's list
+ all_ignore_patterns = list(ignore_patterns or [])
149
+ file_patterns = _read_ignore_file(dir_path, ignore_filename)
150
+ all_ignore_patterns.extend(file_patterns)
151
+
152
+ default_ignores = [".git", "__pycache__", "*.pyc", ".DS_Store", "node_modules"]
153
+ all_ignore_patterns.extend(default_ignores)
154
+
155
+ return all_ignore_patterns
156
+
157
+
158
+ def _create_directory_zip(
159
+ dir_path: Path,
160
+ zip_path: Path,
161
+ ignore_patterns: list[str] | None = None,
162
+ ignore_filename: str = ".gitignore",
163
+ ) -> int:
164
+ """Create a zip file from a directory with ignore patterns.
165
+
166
+ Args:
167
+ dir_path: Directory to zip
168
+ zip_path: Output zip file path
169
+ ignore_patterns: Explicit ignore patterns
170
+ ignore_filename: Name of ignore file to read from directory
171
+
172
+ Returns:
173
+ Size of created zip file in bytes
174
+ """
175
+ all_ignore_patterns = _collect_ignore_patterns(
176
+ dir_path, ignore_patterns, ignore_filename
177
+ )
178
+
179
+ logger.debug(f"Creating zip with ignore patterns: {all_ignore_patterns}")
180
+
181
+ with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
182
+ for file_path in dir_path.rglob("*"):
183
+ if file_path.is_file() and not _should_ignore_file(
184
+ file_path, dir_path, all_ignore_patterns
185
+ ):
186
+ arcname = file_path.relative_to(dir_path)
187
+ zipf.write(file_path, arcname)
188
+ logger.debug(f"Added to zip: {arcname}")
189
+
190
+ zip_size = zip_path.stat().st_size
191
+ logger.debug(f"Created zip file {zip_path} with size {zip_size:,} bytes")
192
+ return zip_size
193
+
194
+
195
+ def _should_send_as_text_content(file_path: Path, file_size: int) -> bool:
196
+ """Check if a file should be sent as text content instead of file upload.
197
+
198
+ Args:
199
+ file_path: Path to the file
200
+ file_size: Size of file in bytes
201
+
202
+ Returns:
203
+ True if file should be sent as text content
204
+ """
205
+ # small files can be treated as raw text
206
+ if file_size >= SMALL_FILE_THRESHOLD_BYTES:
207
+ return False
208
+
209
+ file_extension = file_path.suffix.lower().lstrip(".")
210
+ return file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT
211
+
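+ # For example, a 2 KB "notes.md" is inlined as text content, while a 50 MB
+ # "data.csv" (over the 10 MB threshold) or a "model.bin" (unsupported extension)
+ # falls back to the signed-URL file upload path.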
212
+
213
+ def _extract_text_from_file(file_path: Path) -> str | None:
214
+ """Extract text content from a file.
215
+
216
+ Args:
217
+ file_path: Path to the file
218
+
219
+ Returns:
220
+ Extracted text content or None if extraction failed
221
+ """
222
+ file_extension = file_path.suffix.lower().lstrip(".")
223
+
224
+ if file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT:
225
+ try:
226
+ return file_path.read_text(encoding="utf-8")
227
+ except Exception as e:
228
+ logger.warning(f"Failed to extract text from {file_path}: {e}")
229
+ return None
230
+ else:
231
+ return None
232
+
233
+
234
+ def _setup_upload_progress(file_path: Path, file_size: int, progress_bar: tqdm) -> None:
235
+ """Common setup for upload progress tracking."""
236
+ logger.debug(
237
+ f"Starting resumable upload for file: {file_path} (size: {file_size:,} bytes)"
238
+ )
239
+ progress_bar.set_description(f"Uploading {file_path.name}")
240
+ progress_bar.refresh()
241
+
242
+
243
+ async def _initiate_resumable_session(
244
+ session: aiohttp.ClientSession, signed_url: str
245
+ ) -> str:
246
+ """Initiate resumable upload session and return session URI."""
247
+ logger.debug("Initiating resumable upload session")
248
+ async with session.post(signed_url, headers=INITIATE_HEADERS) as initiate_response:
249
+ if initiate_response.status not in {200, 201}:
250
+ error_text = await initiate_response.text()
251
+ logger.error(
252
+ f"Failed to initiate resumable session: {initiate_response.status}"
253
+ )
254
+ logger.error(f"Response: {error_text}")
255
+ initiate_response.raise_for_status()
256
+
257
+ return _validate_session_uri(initiate_response.headers.get("location"))
258
+
259
+
260
+ # TODO: temp
261
+ def _log_upload_debug(signed_url: str) -> None:
262
+ """Common debug logging for uploads."""
263
+ logger.debug(f"Signed URL: {signed_url[:100]}...")
264
+
265
+
266
+ # TODO: temp
267
+ def _validate_session_uri(session_uri: str | None) -> str:
268
+ """Validate and return session URI or raise exception."""
269
+ if not session_uri:
270
+ raise DataStorageError(
271
+ "No session URI returned from resumable upload initiation"
272
+ )
273
+ logger.debug(f"Resumable session initiated. Session URI: {session_uri[:100]}...")
274
+ return session_uri
275
+
276
+
277
+ async def _upload_chunk_with_retry(
278
+ session: aiohttp.ClientSession,
279
+ session_uri: str,
280
+ chunk_data: bytes,
281
+ range_start: int,
282
+ file_size: int,
283
+ progress_bar: tqdm,
284
+ ) -> int:
285
+ """Upload a single chunk with retry logic."""
286
+ range_end = range_start + len(chunk_data) - 1
287
+ chunk_headers = {
288
+ "Content-Type": "application/octet-stream",
289
+ "Content-Length": str(len(chunk_data)),
290
+ "Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
291
+ }
292
+
293
+ for attempt in range(MAX_RETRIES):
294
+ try:
295
+ async with session.put(
296
+ session_uri, data=chunk_data, headers=chunk_headers
297
+ ) as chunk_response:
298
+ if chunk_response.status == HTTP_RESUME_INCOMPLETE:
299
+ progress_bar.update(len(chunk_data))
300
+ logger.debug(f"Uploaded chunk: {range_end + 1}/{file_size} bytes")
301
+ return len(chunk_data)
302
+ if chunk_response.status in {200, 201}:
303
+ progress_bar.update(len(chunk_data))
304
+ logger.debug(
305
+ f"Upload completed successfully. Final response: {chunk_response.status}"
306
+ )
307
+ return len(chunk_data)
308
+
309
+ error_text = await chunk_response.text()
310
+ logger.warning(
311
+ f"Chunk upload failed (attempt {attempt + 1}/{MAX_RETRIES}): {chunk_response.status}"
312
+ )
313
+ logger.warning(f"Response: {error_text}")
314
+ if attempt == MAX_RETRIES - 1:
315
+ chunk_response.raise_for_status()
316
+
317
+ except (TimeoutError, aiohttp.ClientError) as e:
318
+ logger.warning(
319
+ f"Chunk upload error (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
320
+ )
321
+ if attempt == MAX_RETRIES - 1:
322
+ raise
323
+ await asyncio.sleep(2**attempt)
324
+
325
+ return 0
326
+
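+ # For example, the first 8 MB chunk of a 20 MB file is sent with
+ # "Content-Range: bytes 0-8388607/20971520"; GCS replies 308 for intermediate
+ # chunks and 200/201 once the final chunk lands. Each chunk is attempted up to
+ # MAX_RETRIES times with exponential backoff between attempts.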
327
+
328
+ async def _aupload_file_with_progress(
329
+ signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
330
+ ) -> None:
331
+ """Upload a file asynchronously using aiohttp with signed URL initiation."""
332
+ _setup_upload_progress(file_path, file_size, progress_bar)
333
+ _log_upload_debug(signed_url)
334
+
335
+ try:
336
+ retry_config = aiohttp.ClientTimeout(
337
+ total=max(600.0, file_size / (512 * 1024)), connect=30, sock_read=30
338
+ )
339
+ connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
340
+
341
+ async with aiohttp.ClientSession(
342
+ connector=connector, timeout=retry_config
343
+ ) as session:
344
+ session_uri = await _initiate_resumable_session(session, signed_url)
345
+
346
+ async with aiofiles.open(file_path, "rb") as file_obj:
347
+ bytes_uploaded = 0
348
+
349
+ while bytes_uploaded < file_size:
350
+ remaining = file_size - bytes_uploaded
351
+ current_chunk_size = min(CHUNK_SIZE, remaining)
352
+ chunk_data = await file_obj.read(current_chunk_size)
353
+
354
+ if not chunk_data:
355
+ break
356
+
357
+ uploaded_bytes = await _upload_chunk_with_retry(
358
+ session,
359
+ session_uri,
360
+ chunk_data,
361
+ bytes_uploaded,
362
+ file_size,
363
+ progress_bar,
364
+ )
365
+ bytes_uploaded += uploaded_bytes
366
+
367
+ if bytes_uploaded >= file_size:
368
+ break
369
+
370
+ logger.debug("Upload completed successfully")
371
+
372
+ except Exception as e:
373
+ logger.error(f"Async resumable upload error: {type(e).__name__}: {e}")
374
+ raise
375
+
376
+
377
+ def _upload_file_with_progress(
378
+ signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
379
+ ) -> None:
380
+ """Upload a file synchronously using google.resumable_media with signed URL initiation."""
381
+ _setup_upload_progress(file_path, file_size, progress_bar)
382
+ _log_upload_debug(signed_url)
383
+
384
+ try:
385
+ session = requests_lib.Session()
386
+ retry_strategy = Retry(
387
+ total=MAX_RETRIES,
388
+ backoff_factor=2,
389
+ status_forcelist=[429, 500, 502, 503, 504],
390
+ allowed_methods=["POST", "PUT", "PATCH"],
391
+ )
392
+ adapter = HTTPAdapter(max_retries=retry_strategy)
393
+ session.mount("http://", adapter)
394
+ session.mount("https://", adapter)
395
+
396
+ logger.debug("Initiating resumable upload session")
397
+ initiate_response = session.post(
398
+ signed_url, headers=INITIATE_HEADERS, timeout=30
399
+ )
400
+
401
+ if initiate_response.status_code not in {200, 201}:
402
+ logger.error(
403
+ f"Failed to initiate resumable session: {initiate_response.status_code}"
404
+ )
405
+ logger.error(f"Response: {initiate_response.text}")
406
+ initiate_response.raise_for_status()
407
+
408
+ session_uri = _validate_session_uri(initiate_response.headers.get("location"))
409
+
410
+ with open(file_path, "rb") as file_obj:
411
+ upload = resumable_requests.ResumableUpload(
412
+ upload_url=signed_url, chunk_size=CHUNK_SIZE
413
+ )
414
+
415
+ upload._resumable_url = session_uri
416
+ upload._stream = file_obj
417
+ upload._total_bytes = file_size
418
+
419
+ wrapped_file = ProgressWrapper(file_obj, progress_bar)
420
+ upload._stream = wrapped_file
421
+
422
+ while not upload.finished:
423
+ try:
424
+ upload.transmit_next_chunk(session)
425
+ except Exception as e:
426
+ logger.error(f"Chunk upload failed: {e}")
427
+ raise
428
+
429
+ logger.debug("Upload completed successfully using resumable_media library")
430
+
431
+ except Exception as e:
432
+ logger.error(f"Sync resumable upload error: {type(e).__name__}: {e}")
433
+ raise
434
+
435
+
436
+ class RestClientError(Exception):
437
+ """Base exception for REST client errors."""
438
+
439
+
440
+ class DataStorageError(RestClientError):
441
+ """Base exception for data storage operations."""
442
+
443
+
444
+ class DataStorageCreationError(DataStorageError):
445
+ """Raised when there's an error creating a data storage entry."""
446
+
447
+
448
+ class DataStorageRetrievalError(DataStorageError):
449
+ """Raised when there's an error retrieving a data storage entry."""
450
+
451
+
452
+ class ProgressWrapper:
453
+ """Common progress wrapper for file uploads."""
454
+
455
+ def __init__(self, file_obj, progress_bar):
456
+ self.file_obj = file_obj
457
+ self.progress_bar = progress_bar
458
+ self.bytes_read = 0
459
+
460
+ def read(self, size=-1):
461
+ data = self.file_obj.read(size)
462
+ if data:
463
+ self.bytes_read += len(data)
464
+ current_pos = self.file_obj.tell()
465
+ if current_pos > self.progress_bar.n:
466
+ self.progress_bar.update(current_pos - self.progress_bar.n)
467
+ return data
468
+
469
+ def seek(self, offset, whence=0):
470
+ return self.file_obj.seek(offset, whence)
471
+
472
+ def tell(self):
473
+ return self.file_obj.tell()
474
+
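+ # google.resumable_media pulls data through the stream's read()/seek()/tell()
+ # interface, so wrapping the file object lets the tqdm bar advance as chunks are
+ # consumed without changing how the upload itself reads the file.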
475
+
476
+ class DataStorageMethods: # pylint: disable=too-many-public-methods
477
+ """Data storage methods for RestClient.
478
+
479
+ This class contains methods for interacting with the data storage API endpoints.
480
+ """
481
+
482
+ # needed for mypy `NoReturn`
483
+ def _handle_http_errors(self, e: HTTPStatusError, operation: str) -> NoReturn:
484
+ """Handle common HTTP errors for data storage operations."""
485
+ if e.response.status_code == codes.FORBIDDEN:
486
+ raise DataStorageError(
487
+ f"Error {operation} data storage entry, not authorized"
488
+ ) from e
489
+ if e.response.status_code == codes.UNPROCESSABLE_ENTITY:
490
+ raise DataStorageError(f"Invalid request payload: {e.response.text}") from e
491
+ raise DataStorageError(
492
+ f"Error {operation} data storage entry: {e.response.status_code} - {e.response.text}"
493
+ ) from e
494
+
495
+ def _validate_file_path(self, file_path: str | Path) -> Path:
496
+ """Validate file path exists and return Path object."""
497
+ file_path = Path(file_path)
498
+ if not file_path.exists():
499
+ raise DataStorageError(f"File or directory not found: {file_path}")
500
+ return file_path
501
+
502
+ def _build_zip_path(self, name: str, path: str | None) -> str:
503
+ """Build GCS path for zip file."""
504
+ zip_filename = name if name.endswith(".zip") else f"{name}.zip"
505
+ if path:
506
+ return f"{path.rstrip('/')}/{zip_filename}"
507
+ return zip_filename
508
+
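+ # e.g. _build_zip_path("results", "runs/2024") -> "runs/2024/results.zip",
+ # while _build_zip_path("archive.zip", None) -> "archive.zip".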
509
+ # TODO: methods in here need to be moved to fh tools
510
+ # =====================================
511
+ def _is_zip_file(self, file_path: Path) -> bool:
512
+ """Check if a file is a zip file by checking its magic bytes."""
513
+ try:
514
+ with open(file_path, "rb") as f:
515
+ magic = f.read(2)
516
+ return magic == b"PK"
517
+ except Exception:
518
+ return False
519
+
520
+ def _extract_zip_file(self, zip_path: Path, extract_to: Path) -> Path:
521
+ """Extract a zip file and return the path to the extracted content.
522
+
523
+ Args:
524
+ zip_path: Path to the zip file
525
+ extract_to: Directory to extract to
526
+
527
+ Returns:
528
+ Path to the extracted content (directory or single file)
529
+ """
530
+ extract_dir = extract_to / "extracted"
531
+ extract_dir.mkdir(exist_ok=True)
532
+
533
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
534
+ zip_ref.extractall(extract_dir)
535
+ extracted_items = list(extract_dir.iterdir())
536
+
537
+ if len(extracted_items) == 1:
538
+ return extracted_items[0]
539
+ return extract_dir
540
+
541
+ async def _adownload_from_gcs(
542
+ self, signed_url: str, file_name: str | None = None
543
+ ) -> Path:
544
+ """Download file from GCS using signed URL and handle unzipping if needed.
545
+
546
+ Args:
547
+ signed_url: The signed URL to download from
548
+ file_name: The name of the file to download
549
+
550
+ Returns:
551
+ Path to the downloaded file (or unzipped directory if it was a zip)
552
+ """
553
+ file_name = file_name or "downloaded_file"
554
+
555
+ try:
556
+ with tempfile.TemporaryDirectory() as temp_dir_str:
557
+ temp_dir = Path(temp_dir_str)
558
+ temp_file = temp_dir / file_name
559
+
560
+ async with self.async_client.stream("GET", signed_url) as response:
561
+ response.raise_for_status()
562
+
563
+ content_disposition = response.headers.get(
564
+ "content-disposition", ""
565
+ )
566
+ filename = file_name
567
+ if "filename=" in content_disposition:
568
+ filename = content_disposition.split("filename=")[-1].strip('"')
569
+
570
+ if filename != file_name:
571
+ temp_file = temp_dir / filename
572
+
573
+ async with aiofiles.open(temp_file, "wb") as f:
574
+ async for chunk in response.aiter_bytes(chunk_size=8192):
575
+ await f.write(chunk)
576
+
577
+ logger.debug(
578
+ f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
579
+ )
580
+
581
+ if self._is_zip_file(temp_file):
582
+ logger.debug(f"File {temp_file} is a zip file, extracting...")
583
+ extracted_path = self._extract_zip_file(temp_file, temp_dir)
584
+
585
+ final_temp_dir = Path(tempfile.mkdtemp())
586
+ final_path = final_temp_dir / extracted_path.name
587
+
588
+ if extracted_path.is_dir():
589
+ shutil.copytree(extracted_path, final_path)
590
+ else:
591
+ shutil.copy2(extracted_path, final_path)
592
+
593
+ return final_path
594
+ final_temp_dir = Path(tempfile.mkdtemp())
595
+ final_file = final_temp_dir / temp_file.name
596
+ shutil.copy2(temp_file, final_file)
597
+ return final_file
598
+
599
+ except Exception as e:
600
+ raise DataStorageError(f"Failed to download from GCS: {e}") from e
601
+
602
+ def _download_from_gcs(self, signed_url: str, file_name: str | None = None) -> Path:
603
+ """Download file from GCS using signed URL and handle unzipping if needed (sync version).
604
+
605
+ Args:
606
+ signed_url: The signed URL to download from
607
+ file_name: The name of the file to download
608
+ Returns:
609
+ Path to the downloaded file (or unzipped directory if it was a zip)
610
+ """
611
+ file_name = file_name or "downloaded_file"
612
+
613
+ try:
614
+ with tempfile.TemporaryDirectory() as temp_dir_str:
615
+ temp_dir = Path(temp_dir_str)
616
+ temp_file = temp_dir / file_name
617
+
618
+ with requests_lib.get(signed_url, stream=True, timeout=30) as response:
619
+ response.raise_for_status()
620
+
621
+ content_disposition = response.headers.get(
622
+ "content-disposition", ""
623
+ )
624
+ filename = file_name
625
+ if "filename=" in content_disposition:
626
+ filename = content_disposition.split("filename=")[-1].strip('"')
627
+
628
+ if filename != file_name:
629
+ temp_file = temp_dir / filename
630
+
631
+ with open(temp_file, "wb") as f:
632
+ for chunk in response.iter_content(chunk_size=8192):
633
+ f.write(chunk)
634
+
635
+ logger.debug(
636
+ f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
637
+ )
638
+
639
+ if self._is_zip_file(temp_file):
640
+ logger.debug(f"File {temp_file} is a zip file, extracting...")
641
+ extracted_path = self._extract_zip_file(temp_file, temp_dir)
642
+
643
+ final_temp_dir = Path(tempfile.mkdtemp())
644
+ final_path = final_temp_dir / extracted_path.name
645
+
646
+ if extracted_path.is_dir():
647
+ shutil.copytree(extracted_path, final_path)
648
+ else:
649
+ shutil.copy2(extracted_path, final_path)
650
+
651
+ return final_path
652
+ final_temp_dir = Path(tempfile.mkdtemp())
653
+ final_file = final_temp_dir / temp_file.name
654
+ shutil.copy2(temp_file, final_file)
655
+ return final_file
656
+
657
+ except Exception as e:
658
+ raise DataStorageError(f"Failed to download from GCS: {e}") from e
659
+
660
+ # =====================================
661
+
662
+ def _prepare_single_file_upload(
663
+ self, name: str, file_path: Path, description: str | None, path: str | None
664
+ ) -> tuple[int, DataStorageRequestPayload | None]:
665
+ """Prepare single file for upload, return file size and payload if text content."""
666
+ file_size = file_path.stat().st_size
667
+ logger.debug(
668
+ f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
669
+ )
670
+
671
+ if _should_send_as_text_content(file_path, file_size):
672
+ logger.debug(
673
+ f"Small text file ({file_size:,} bytes) - sending as text content"
674
+ )
675
+ text_content = _extract_text_from_file(file_path)
676
+ if text_content is not None:
677
+ return file_size, DataStorageRequestPayload(
678
+ name=name,
679
+ description=description,
680
+ content=text_content,
681
+ path=path,
682
+ is_collection=False,
683
+ )
684
+ logger.warning(
685
+ "Could not extract text content, falling back to file upload"
686
+ )
687
+
688
+ return file_size, None
689
+
690
+ def _create_data_storage_entry(
691
+ self, payload: DataStorageRequestPayload
692
+ ) -> DataStorageResponse:
693
+ """Create data storage entry via API (sync version)."""
694
+ response = self.client.post(
695
+ "/v0.1/data-storage/data-entries",
696
+ json=payload.model_dump(mode="json", exclude_none=True),
697
+ )
698
+ response.raise_for_status()
699
+ return DataStorageResponse.model_validate(response.json())
700
+
701
+ async def _acreate_data_storage_entry(
702
+ self, payload: DataStorageRequestPayload
703
+ ) -> DataStorageResponse:
704
+ """Create data storage entry via API (async version)."""
705
+ response = await self.async_client.post(
706
+ "/v0.1/data-storage/data-entries",
707
+ json=payload.model_dump(mode="json", exclude_none=True),
708
+ )
709
+ response.raise_for_status()
710
+ return DataStorageResponse.model_validate(response.json())
711
+
712
+ def _generate_folder_description_from_files(
713
+ self, dir_path: Path, manifest: DirectoryManifest
714
+ ) -> str:
715
+ """Generate folder description by concatenating descriptions of top-level files."""
716
+ descriptions = []
717
+
718
+ # Get top-level files only (not recursive)
719
+ for item in dir_path.iterdir():
720
+ if item.is_file():
721
+ # Try to get description from manifest first
722
+ file_desc = manifest.get_entry_description(item.name)
723
+
724
+ if file_desc:
725
+ descriptions.append(f"{item.name}: {file_desc}")
726
+ else:
727
+ descriptions.append(item.name)
728
+
729
+ if descriptions:
730
+ return f"Directory containing: {', '.join(descriptions)}"
731
+ return f"Directory: {dir_path.name}"
732
+
733
+ def _load_manifest(
734
+ self, dir_path: Path, manifest_filename: str | None
735
+ ) -> DirectoryManifest:
736
+ """Load and parse a manifest file (JSON or YAML) into a structured model."""
737
+ if not manifest_filename:
738
+ return DirectoryManifest()
739
+
740
+ manifest_path = dir_path / manifest_filename
741
+ if not manifest_path.exists():
742
+ logger.error(f"Manifest file not found at {manifest_path}")
743
+ raise DataStorageCreationError(
744
+ f"Manifest file {manifest_filename} not found in directory {dir_path}. Ensure the manifest exists and is correctly named, or do not pass it as an argument."
745
+ )
746
+
747
+ try:
748
+ with open(manifest_path, encoding="utf-8") as f:
749
+ data = {}
750
+ if manifest_filename.lower().endswith(".json"):
751
+ data = json.load(f)
752
+ elif manifest_filename.lower().endswith((".yaml", ".yml")):
753
+ if yaml is None:
754
+ raise ImportError(
755
+ "pyyaml is required to parse .yaml manifest files. "
756
+ "Please install it with `pip install pyyaml`."
757
+ )
758
+ data = yaml.safe_load(f)
759
+ else:
760
+ logger.warning(
761
+ f"Unsupported manifest file extension: {manifest_filename}"
762
+ )
763
+ return DirectoryManifest()
764
+
765
+ return DirectoryManifest.from_dict(data or {})
766
+
767
+ except Exception as e:
768
+ logger.warning(f"Failed to load manifest {manifest_filename}: {e}")
769
+
770
+ return DirectoryManifest()
771
+
772
+ def _upload_data_directory(
773
+ self,
774
+ name: str,
775
+ dir_path: Path,
776
+ description: str | None,
777
+ path: str | None = None,
778
+ ignore_patterns: list[str] | None = None,
779
+ ignore_filename: str = ".gitignore",
780
+ project_id: UUID | None = None,
781
+ ) -> DataStorageResponse:
782
+ """Upload a directory as a single zip file collection.
783
+
784
+ Args:
785
+ name: Name for the directory collection
786
+ dir_path: Path to directory to zip and upload
787
+ description: Description for the collection
788
+ path: Optional GCS path for the zip file
789
+ ignore_patterns: List of patterns to ignore when zipping
790
+ ignore_filename: Name of ignore file to read from directory
791
+ project_id: ID of the project this data storage entry belongs to
792
+
793
+ Returns:
794
+ DataStorageResponse for the uploaded zip file
795
+ """
796
+ logger.debug(f"Uploading directory as zip: {dir_path}")
797
+
798
+ with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
799
+ temp_zip_path = Path(temp_file.name)
800
+
801
+ zip_size = _create_directory_zip(
802
+ dir_path, temp_zip_path, ignore_patterns, ignore_filename
803
+ )
804
+
805
+ zip_gcs_path = self._build_zip_path(name, path)
806
+ payload = DataStorageRequestPayload(
807
+ name=name,
808
+ description=description,
809
+ path=zip_gcs_path,
810
+ is_collection=True,
811
+ project_id=project_id,
812
+ )
813
+
814
+ logger.debug(
815
+ f"Creating data storage entry for zip: {payload.model_dump(exclude_none=True)}"
816
+ )
817
+ data_storage_response = self._create_data_storage_entry(payload)
818
+
819
+ for storage_location in data_storage_response.storage_locations:
820
+ if not storage_location.storage_config.signed_url:
821
+ raise DataStorageCreationError(
822
+ "No signed URL returned for zip upload"
823
+ )
824
+
825
+ with tqdm(
826
+ total=zip_size,
827
+ unit="B",
828
+ unit_scale=True,
829
+ unit_divisor=1024,
830
+ desc=f"Uploading {dir_path.name} (zipped)",
831
+ miniters=1,
832
+ mininterval=0.1,
833
+ ) as pbar:
834
+ _upload_file_with_progress(
835
+ storage_location.storage_config.signed_url,
836
+ temp_zip_path,
837
+ pbar,
838
+ zip_size,
839
+ )
840
+
841
+ status_response = self.client.patch(
842
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
843
+ json={"status": "active"},
844
+ )
845
+ status_response.raise_for_status()
846
+
847
+ logger.debug(
848
+ f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
849
+ )
850
+ return DataStorageResponse.model_validate(status_response.json())
851
+
852
+ async def _aupload_data_directory(
853
+ self,
854
+ name: str,
855
+ dir_path: Path,
856
+ description: str | None,
857
+ path: str | None = None,
858
+ ignore_patterns: list[str] | None = None,
859
+ ignore_filename: str = ".gitignore",
860
+ project_id: UUID | None = None,
861
+ ) -> DataStorageResponse:
862
+ """Asynchronously upload a directory as a single zip file.
863
+
864
+ Args:
865
+ name: Name for the directory collection
866
+ dir_path: Path to directory to zip and upload
867
+ description: Description for the collection
868
+ path: Optional GCS path for the zip file
869
+ ignore_patterns: List of patterns to ignore when zipping
870
+ ignore_filename: Name of ignore file to read from directory
871
+ project_id: ID of the project this data storage entry belongs to
872
+
873
+ Returns:
874
+ DataStorageResponse for the uploaded zip file
875
+ """
876
+ logger.debug(f"Async uploading directory as zip: {dir_path}")
877
+
878
+ with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
879
+ temp_zip_path = Path(temp_file.name)
880
+
881
+ zip_size = _create_directory_zip(
882
+ dir_path, temp_zip_path, ignore_patterns, ignore_filename
883
+ )
884
+
885
+ zip_gcs_path = self._build_zip_path(name, path)
886
+ payload = DataStorageRequestPayload(
887
+ name=name,
888
+ description=description,
889
+ path=zip_gcs_path,
890
+ is_collection=True,
891
+ project_id=project_id,
892
+ )
893
+
894
+ data_storage_response = await self._acreate_data_storage_entry(payload)
895
+
896
+ for storage_location in data_storage_response.storage_locations:
897
+ if not storage_location.storage_config.signed_url:
898
+ raise DataStorageCreationError(
899
+ "No signed URL returned for zip upload"
900
+ )
901
+
902
+ with tqdm(
903
+ total=zip_size,
904
+ unit="B",
905
+ unit_scale=True,
906
+ unit_divisor=1024,
907
+ desc=f"Uploading {dir_path.name} (zipped)",
908
+ miniters=1,
909
+ mininterval=0.1,
910
+ ) as pbar:
911
+ await _aupload_file_with_progress(
912
+ storage_location.storage_config.signed_url,
913
+ temp_zip_path,
914
+ pbar,
915
+ zip_size,
916
+ )
917
+
918
+ status_response = await self.async_client.patch(
919
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
920
+ json={"status": "active"},
921
+ )
922
+ status_response.raise_for_status()
923
+
924
+ logger.debug(
925
+ f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
926
+ )
927
+ return DataStorageResponse.model_validate(status_response.json())
928
+
929
+ def _upload_data_single_file(
930
+ self,
931
+ name: str,
932
+ file_path: Path,
933
+ description: str | None,
934
+ path: str | None = None,
935
+ project_id: UUID | None = None,
936
+ ) -> DataStorageResponse:
937
+ """Upload a single file."""
938
+ file_size = file_path.stat().st_size
939
+ logger.debug(
940
+ f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
941
+ )
942
+
943
+ if _should_send_as_text_content(file_path, file_size):
944
+ logger.debug(
945
+ f"Small text file ({file_size:,} bytes) - sending as text content"
946
+ )
947
+
948
+ text_content = _extract_text_from_file(file_path)
949
+ if text_content is not None:
950
+ payload = DataStorageRequestPayload(
951
+ name=name,
952
+ description=description,
953
+ content=text_content,
954
+ path=path,
955
+ is_collection=False,
956
+ project_id=project_id,
957
+ )
958
+
959
+ logger.debug("Sending file as text content")
960
+ return self._create_data_storage_entry(payload)
961
+ logger.warning(
962
+ "Could not extract text content, falling back to file upload"
963
+ )
964
+
965
+ logger.debug(
966
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
967
+ )
968
+ payload = DataStorageRequestPayload(
969
+ name=name,
970
+ description=description,
971
+ path=path,
972
+ is_collection=False,
973
+ project_id=project_id,
974
+ )
975
+
976
+ logger.debug(
977
+ f"Requesting signed URL with payload: {payload.model_dump(exclude_none=True)}"
978
+ )
979
+
980
+ data_storage_response = self._create_data_storage_entry(payload)
981
+
982
+ for storage_location in data_storage_response.storage_locations:
983
+ if not storage_location.storage_config.signed_url:
984
+ raise DataStorageCreationError("No signed URL returned from server")
985
+
986
+ with tqdm(
987
+ total=file_size,
988
+ unit="B",
989
+ unit_scale=True,
990
+ unit_divisor=1024,
991
+ desc=f"Uploading {file_path.name}",
992
+ miniters=1,
993
+ mininterval=0.1,
994
+ ) as pbar:
995
+ try:
996
+ _upload_file_with_progress(
997
+ storage_location.storage_config.signed_url,
998
+ file_path,
999
+ pbar,
1000
+ file_size,
1001
+ )
1002
+ logger.debug("File upload to signed URL completed successfully")
1003
+ except Exception as e:
1004
+ logger.error(f"Failed to upload file to signed URL: {e}")
1005
+ raise
1006
+
1007
+ logger.debug("Updating data storage status to active")
1008
+ status_response = self.client.patch(
1009
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1010
+ json={"status": "active"},
1011
+ )
1012
+ status_response.raise_for_status()
1013
+ logger.debug("Data storage status updated successfully")
1014
+
1015
+ return DataStorageResponse.model_validate(status_response.json())
1016
+
1017
+ async def _aupload_data_single_file(
1018
+ self,
1019
+ name: str,
1020
+ file_path: Path,
1021
+ description: str | None,
1022
+ path: str | None = None,
1023
+ dataset_id: UUID | None = None,
1024
+ project_id: UUID | None = None,
1025
+ ) -> DataStorageResponse:
1026
+ """Asynchronously upload a single file."""
1027
+ file_size, text_payload = self._prepare_single_file_upload(
1028
+ name, file_path, description, path
1029
+ )
1030
+
1031
+ if text_payload:
1032
+ logger.debug("Sending file as text content")
1033
+ text_payload.dataset_id = dataset_id
+ text_payload.project_id = project_id
1034
+ return await self._acreate_data_storage_entry(text_payload)
1035
+
1036
+ logger.debug(
1037
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
1038
+ )
1039
+ payload = DataStorageRequestPayload(
1040
+ name=name,
1041
+ description=description,
1042
+ path=path,
1043
+ is_collection=False,
1044
+ dataset_id=dataset_id,
1045
+ project_id=project_id,
1046
+ )
1047
+
1048
+ data_storage_response = await self._acreate_data_storage_entry(payload)
1049
+
1050
+ for location in data_storage_response.storage_locations:
1051
+ if not location.storage_config.signed_url:
1052
+ raise DataStorageCreationError(
1053
+ f"No signed URL returned from server for location: {location.id}"
1054
+ )
1055
+
1056
+ with tqdm(
1057
+ total=file_size,
1058
+ unit="B",
1059
+ unit_scale=True,
1060
+ unit_divisor=1024,
1061
+ desc=f"Uploading {file_path.name}",
1062
+ miniters=1,
1063
+ mininterval=0.1,
1064
+ leave=False,
1065
+ ) as pbar:
1066
+ await _aupload_file_with_progress(
1067
+ location.storage_config.signed_url, file_path, pbar, file_size
1068
+ )
1069
+
1070
+ status_response = await self.async_client.patch(
1071
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1072
+ json={"status": "active"},
1073
+ )
1074
+ status_response.raise_for_status()
1075
+
1076
+ return DataStorageResponse.model_validate(status_response.json())
1077
+
1078
+ def _upload_data_single_file_with_parent(
1079
+ self,
1080
+ name: str,
1081
+ file_path: Path,
1082
+ description: str | None,
1083
+ path: str | None,
1084
+ parent_id: UUID | None,
1085
+ dataset_id: UUID | None = None,
1086
+ project_id: UUID | None = None,
1087
+ ) -> DataStorageResponse:
1088
+ """Upload a single file with a parent ID (sync version)."""
1089
+ file_size, text_payload = self._prepare_single_file_upload(
1090
+ name, file_path, description, path
1091
+ )
1092
+
1093
+ if text_payload:
1094
+ logger.debug("Sending file as text content with parent_id")
1095
+ text_payload.parent_id = parent_id
1096
+ text_payload.dataset_id = dataset_id
1097
+ text_payload.project_id = project_id
1098
+ return self._create_data_storage_entry(text_payload)
1099
+
1100
+ logger.debug(
1101
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
1102
+ )
1103
+ payload = DataStorageRequestPayload(
1104
+ name=name,
1105
+ description=description,
1106
+ path=path,
1107
+ is_collection=False,
1108
+ parent_id=parent_id,
1109
+ dataset_id=dataset_id,
1110
+ project_id=project_id,
1111
+ )
1112
+ data_storage_response = self._create_data_storage_entry(payload)
1113
+
1114
+ for location in data_storage_response.storage_locations:
1115
+ if not location.storage_config.signed_url:
1116
+ raise DataStorageCreationError("No signed URL returned from server")
1117
+
1118
+ with tqdm(
1119
+ total=file_size,
1120
+ unit="B",
1121
+ unit_scale=True,
1122
+ unit_divisor=1024,
1123
+ desc=f"Uploading {file_path.name}",
1124
+ miniters=1,
1125
+ mininterval=0.1,
1126
+ leave=False,
1127
+ ) as pbar:
1128
+ _upload_file_with_progress(
1129
+ location.storage_config.signed_url, file_path, pbar, file_size
1130
+ )
1131
+
1132
+ status_response = self.client.patch(
1133
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1134
+ json={"status": "active"},
1135
+ )
1136
+ status_response.raise_for_status()
1137
+
1138
+ return DataStorageResponse.model_validate(status_response.json())
1139
+
1140
+ def _process_file_item(
1141
+ self,
1142
+ item: Path,
1143
+ dir_manifest: DirectoryManifest,
1144
+ current_parent_id: UUID,
1145
+ dataset_id: UUID | None = None,
1146
+ project_id: UUID | None = None,
1147
+ ) -> DataStorageResponse | None:
1148
+ """Process a single file item for upload."""
1149
+ try:
1150
+ manifest_desc = dir_manifest.get_entry_description(item.name)
1151
+ file_description = manifest_desc or f"File: {item.name}"
1152
+
1153
+ logger.debug(
1154
+ f"Processing file {item.name} with description: '{file_description}'"
1155
+ )
1156
+
1157
+ return self._upload_data_single_file_with_parent(
1158
+ name=item.name,
1159
+ file_path=item,
1160
+ description=file_description,
1161
+ path=None,
1162
+ parent_id=current_parent_id,
1163
+ dataset_id=dataset_id,
1164
+ project_id=project_id,
1165
+ )
1166
+ except Exception as e:
1167
+ logger.error(f"Failed to upload file {item}: {e}")
1168
+ return None
1169
+
1170
+ def _upload_directory_hierarchically(
1171
+ self,
1172
+ name: str,
1173
+ dir_path: Path,
1174
+ description: str | None = None,
1175
+ manifest_filename: str | None = None,
1176
+ parent_id: UUID | None = None,
1177
+ ignore_patterns: list[str] | None = None,
1178
+ ignore_filename: str = ".gitignore",
1179
+ base_dir: Path | None = None,
1180
+ dir_manifest: DirectoryManifest | None = None,
1181
+ dataset_id: UUID | None = None,
1182
+ project_id: UUID | None = None,
1183
+ ) -> list[DataStorageResponse]:
1184
+ """Upload a directory with single dataset and individual file storage entries."""
1185
+ responses = []
1186
+ if parent_id is None:
1187
+ base_dir = dir_path
1188
+ all_ignore_patterns = _collect_ignore_patterns(
1189
+ base_dir, ignore_patterns, ignore_filename
1190
+ )
1191
+
1192
+ payload = DataStorageRequestPayload(
1193
+ name=name,
1194
+ description=description,
1195
+ parent_id=None,
1196
+ dataset_id=None,
1197
+ is_collection=False,
1198
+ project_id=project_id,
1199
+ )
1200
+
1201
+ dir_response = self._create_data_storage_entry(payload)
1202
+ responses.append(dir_response)
1203
+ current_parent_id = dir_response.data_storage.id
1204
+ current_dataset_id = dir_response.data_storage.dataset_id
1205
+
1206
+ dir_manifest = self._load_directory_manifest(
1207
+ manifest_filename, parent_id, dir_path
1208
+ )
1209
+ else:
1210
+ all_ignore_patterns = ignore_patterns or []
1211
+ current_parent_id = parent_id
1212
+ current_dataset_id = dataset_id
1213
+
1214
+ for item in dir_path.iterdir():
1215
+ if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
1216
+ continue
1217
+
1218
+ if item.is_dir():
1219
+ subdir_manifest = DirectoryManifest()
1220
+ if dir_manifest:
1221
+ entry = dir_manifest.entries.get(item.name)
1222
+ if isinstance(entry, DirectoryManifest):
1223
+ subdir_manifest = entry
1224
+ elif isinstance(entry, ManifestEntry):
1225
+ # Convert single entry to manifest
1226
+ subdir_manifest = DirectoryManifest(entries={item.name: entry})
1227
+
1228
+ subdir_description = subdir_manifest.get_entry_description(item.name)
1229
+ if not subdir_description:
1230
+ subdir_description = self._generate_folder_description_from_files(
1231
+ item, subdir_manifest
1232
+ )
1233
+
1234
+ subdir_payload = DataStorageRequestPayload(
1235
+ name=item.name,
1236
+ description=subdir_description,
1237
+ parent_id=current_parent_id,
1238
+ dataset_id=current_dataset_id,
1239
+ is_collection=False,
1240
+ project_id=project_id,
1241
+ )
1242
+ subdir_response = self._create_data_storage_entry(subdir_payload)
1243
+ responses.append(subdir_response)
1244
+
1245
+ subdir_responses = self._upload_directory_hierarchically(
1246
+ name=item.name,
1247
+ dir_path=item,
1248
+ description=None,
1249
+ manifest_filename=None,
1250
+ parent_id=subdir_response.data_storage.id,
1251
+ ignore_patterns=all_ignore_patterns,
1252
+ ignore_filename=ignore_filename,
1253
+ base_dir=base_dir,
1254
+ dir_manifest=subdir_manifest,
1255
+ dataset_id=current_dataset_id,
1256
+ project_id=project_id,
1257
+ )
1258
+ responses.extend(subdir_responses)
1259
+ elif item.is_file():
1260
+ file_response = self._process_file_item(
1261
+ item,
1262
+ dir_manifest or DirectoryManifest(),
1263
+ current_parent_id,
1264
+ current_dataset_id,
+ project_id=project_id,
1265
+ )
1266
+ if file_response:
1267
+ responses.append(file_response)
1268
+
1269
+ return responses
1270
+
1271
+ def _load_directory_manifest(
1272
+ self,
1273
+ manifest_filename: str | None,
1274
+ parent_id: UUID | None,
1275
+ dir_path: Path,
1276
+ ) -> DirectoryManifest:
1277
+ """Load directory manifest if available."""
1278
+ if manifest_filename and not parent_id:
1279
+ manifest_data = self._load_manifest(Path.cwd(), manifest_filename)
1280
+ dir_name = dir_path.name
1281
+ logger.debug(
1282
+ f"Loaded manifest entries: {list(manifest_data.entries.keys())}"
1283
+ )
1284
+ logger.debug(
1285
+ f"Looking for manifest entry with directory name: '{dir_name}'"
1286
+ )
1287
+
1288
+ entry = manifest_data.entries.get(dir_name)
1289
+ if isinstance(entry, DirectoryManifest):
1290
+ return entry
1291
+ if isinstance(entry, ManifestEntry):
1292
+ return DirectoryManifest(entries={dir_name: entry})
1293
+ logger.debug(
1294
+ f"No manifest entry found for '{dir_name}', available keys: {list(manifest_data.entries.keys())}"
1295
+ )
1296
+ return DirectoryManifest()
1297
+ return DirectoryManifest()
1298
+
1299
+ async def _aupload_data_single_file_with_parent(
1300
+ self,
1301
+ name: str,
1302
+ file_path: Path,
1303
+ description: str | None,
1304
+ path: str | None,
1305
+ parent_id: UUID | None,
1306
+ dataset_id: UUID | None = None,
1307
+ project_id: UUID | None = None,
1308
+ ) -> DataStorageResponse:
1309
+ """Asynchronously upload a single file with a parent ID."""
1310
+ file_size, text_payload = self._prepare_single_file_upload(
1311
+ name, file_path, description, path
1312
+ )
1313
+
1314
+ if text_payload:
1315
+ logger.debug("Sending file as text content with parent_id")
1316
+ text_payload.parent_id = parent_id
1317
+ text_payload.dataset_id = dataset_id
1318
+ text_payload.project_id = project_id
1319
+ return await self._acreate_data_storage_entry(text_payload)
1320
+
1321
+ logger.debug(
1322
+ f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
1323
+ )
1324
+ payload = DataStorageRequestPayload(
1325
+ name=name,
1326
+ description=description,
1327
+ path=path,
1328
+ is_collection=False,
1329
+ parent_id=parent_id,
1330
+ dataset_id=dataset_id,
1331
+ project_id=project_id,
1332
+ )
1333
+ data_storage_response = await self._acreate_data_storage_entry(payload)
1334
+
1335
+ storage_location = data_storage_response.storage_locations[0]
1336
+
1337
+ if not storage_location.storage_config.signed_url:
1338
+ raise DataStorageCreationError("No signed URL returned from server")
1339
+
1340
+ with tqdm(
1341
+ total=file_size,
1342
+ unit="B",
1343
+ unit_scale=True,
1344
+ unit_divisor=1024,
1345
+ desc=f"Uploading {file_path.name}",
1346
+ miniters=1,
1347
+ mininterval=0.1,
1348
+ ) as pbar:
1349
+ await _aupload_file_with_progress(
1350
+ storage_location.storage_config.signed_url, file_path, pbar, file_size
1351
+ )
1352
+
1353
+ status_response = await self.async_client.patch(
1354
+ f"/v0.1/data-storage/data-entries/{data_storage_response.data_storage.id}",
1355
+ json={"status": "active"},
1356
+ )
1357
+ status_response.raise_for_status()
1358
+
1359
+ return DataStorageResponse.model_validate(status_response.json())
1360
+
1361
+ async def _aprocess_file_item(
1362
+ self,
1363
+ item: Path,
1364
+ dir_manifest: DirectoryManifest,
1365
+ current_parent_id: UUID,
1366
+ dataset_id: UUID | None = None,
1367
+ project_id: UUID | None = None,
1368
+ ) -> DataStorageResponse | None:
1369
+ """Asynchronously process a single file item for upload."""
1370
+ try:
1371
+ manifest_desc = dir_manifest.get_entry_description(item.name)
1372
+ file_description = manifest_desc or f"File: {item.name}"
1373
+
1374
+ logger.debug(
1375
+ f"Processing file {item.name} with description: '{file_description}'"
1376
+ )
1377
+
1378
+ return await self._aupload_data_single_file_with_parent(
1379
+ name=item.name,
1380
+ file_path=item,
1381
+ description=file_description,
1382
+ path=None,
1383
+ parent_id=current_parent_id,
1384
+ dataset_id=dataset_id,
1385
+ project_id=project_id,
1386
+ )
1387
+ except Exception as e:
1388
+ logger.error(f"Failed to upload file {item}: {e}")
1389
+ return None
1390
+
1391
+ async def _aupload_directory_hierarchically(
1392
+ self,
1393
+ name: str,
1394
+ dir_path: Path,
1395
+ description: str | None = None,
1396
+ manifest_filename: str | None = None,
1397
+ parent_id: UUID | None = None,
1398
+ ignore_patterns: list[str] | None = None,
1399
+ ignore_filename: str = ".gitignore",
1400
+ base_dir: Path | None = None,
1401
+ dir_manifest: DirectoryManifest | None = None,
1402
+ dataset_id: UUID | None = None,
1403
+ project_id: UUID | None = None,
1404
+ ) -> list[DataStorageResponse]:
1405
+ """Upload a directory with single dataset and individual file storage entries (async)."""
1406
+ responses = []
1407
+
1408
+ if parent_id is None:
1409
+ base_dir = dir_path
1410
+ all_ignore_patterns = _collect_ignore_patterns(
1411
+ base_dir, ignore_patterns, ignore_filename
1412
+ )
1413
+
1414
+ payload = DataStorageRequestPayload(
1415
+ name=name,
1416
+ description=description,
1417
+ parent_id=None,
1418
+ dataset_id=None,
1419
+ is_collection=False,
1420
+ project_id=project_id,
1421
+ )
1422
+
1423
+ dir_response = await self._acreate_data_storage_entry(payload)
1424
+ responses.append(dir_response)
1425
+ current_parent_id = dir_response.data_storage.id
1426
+ current_dataset_id = dir_response.data_storage.dataset_id
1427
+
1428
+ dir_manifest = self._load_directory_manifest(
1429
+ manifest_filename, parent_id, dir_path
1430
+ )
1431
+ else:
1432
+ all_ignore_patterns = ignore_patterns or []
1433
+ current_parent_id = parent_id
1434
+ current_dataset_id = dataset_id
1435
+
1436
+ for item in dir_path.iterdir():
1437
+ if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
1438
+ continue
1439
+
1440
+ if item.is_dir():
1441
+ subdir_manifest = DirectoryManifest()
1442
+ if dir_manifest:
1443
+ entry = dir_manifest.entries.get(item.name)
1444
+ if isinstance(entry, DirectoryManifest):
1445
+ subdir_manifest = entry
1446
+ elif isinstance(entry, ManifestEntry):
1447
+ subdir_manifest = DirectoryManifest(entries={item.name: entry})
1448
+
1449
+ subdir_description = subdir_manifest.get_entry_description(item.name)
1450
+ if not subdir_description:
1451
+ subdir_description = self._generate_folder_description_from_files(
1452
+ item, subdir_manifest
1453
+ )
1454
+
1455
+ subdir_payload = DataStorageRequestPayload(
1456
+ name=item.name,
1457
+ description=subdir_description,
1458
+ parent_id=current_parent_id,
1459
+ dataset_id=current_dataset_id,
1460
+ is_collection=False,
1461
+ project_id=project_id,
1462
+ )
1463
+ subdir_response = await self._acreate_data_storage_entry(subdir_payload)
1464
+ responses.append(subdir_response)
1465
+
1466
+ subdir_responses = await self._aupload_directory_hierarchically(
1467
+ name=item.name,
1468
+ dir_path=item,
1469
+ description=None,
1470
+ manifest_filename=None,
1471
+ parent_id=subdir_response.data_storage.id,
1472
+ ignore_patterns=all_ignore_patterns,
1473
+ ignore_filename=ignore_filename,
1474
+ base_dir=base_dir,
1475
+ dir_manifest=subdir_manifest,
1476
+ dataset_id=current_dataset_id,
1477
+ project_id=project_id,
1478
+ )
1479
+ responses.extend(subdir_responses)
1480
+ elif item.is_file():
1481
+ file_response = await self._aprocess_file_item(
1482
+ item,
1483
+ dir_manifest or DirectoryManifest(),
1484
+ current_parent_id,
1485
+ current_dataset_id,
+ project_id=project_id,
1486
+ )
1487
+ if file_response:
1488
+ responses.append(file_response)
1489
+
1490
+ return responses
1491
+
1492
+ @property
1493
+ def client(self) -> Client:
1494
+ raise NotImplementedError("client property must be implemented by subclass")
1495
+
1496
+ @property
1497
+ def async_client(self) -> AsyncClient:
1498
+ raise NotImplementedError(
1499
+ "async_client property must be implemented by subclass"
1500
+ )
1501
+
1502
+ @retry(
1503
+ stop=stop_after_attempt(3),
1504
+ wait=wait_exponential(multiplier=1, max=10),
1505
+ retry=retry_if_connection_error,
1506
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1507
+ )
1508
+ def store_text_content(
1509
+ self,
1510
+ name: str,
1511
+ content: str,
1512
+ description: str | None = None,
1513
+ path: str | None = None,
1514
+ project_id: UUID | None = None,
1515
+ ) -> DataStorageResponse:
1516
+ """Store content as a string in the data storage system.
1517
+
1518
+ Args:
1519
+ name: Name of the data storage entry
1520
+ content: Content to store as a string
1521
+ description: Optional description of the data storage entry
1522
+ path: Optional path for the data storage entry
1523
+ project_id: ID of the project this data storage entry belongs to
1524
+
1525
+ Returns:
1526
+ DataStorageResponse containing the created data storage entry and storage locations
1527
+
1528
+ Raises:
1529
+ DataStorageCreationError: If there's an error creating the data storage entry
1530
+ """
1531
+ try:
1532
+ payload = DataStorageRequestPayload(
1533
+ name=name,
1534
+ content=content,
1535
+ description=description,
1536
+ path=path,
1537
+ project_id=project_id,
1538
+ )
1539
+ return self._create_data_storage_entry(payload)
1540
+ except HTTPStatusError as e:
1541
+ self._handle_http_errors(e, "creating")
1542
+ except Exception as e:
1543
+ raise DataStorageCreationError(
1544
+ f"An unexpected error occurred: {e!r}"
1545
+ ) from e
1546
+
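+ # Illustrative usage (assuming a configured RestClient subclass instance, here
+ # called `client`):
+ #     entry = client.store_text_content(
+ #         name="notes.md",
+ #         content="# Experiment notes\n...",
+ #         description="scratch notes",
+ #     )
+ #     print(entry.data_storage.id)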
1547
+ @retry(
1548
+ stop=stop_after_attempt(3),
1549
+ wait=wait_exponential(multiplier=1, max=10),
1550
+ retry=retry_if_connection_error,
1551
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1552
+ )
1553
+ async def astore_text_content(
1554
+ self,
1555
+ name: str,
1556
+ content: str,
1557
+ description: str | None = None,
1558
+ path: str | None = None,
1559
+ dataset_id: UUID | None = None,
1560
+ project_id: UUID | None = None,
1561
+ ) -> DataStorageResponse:
1562
+ """Asynchronously store content as a string in the data storage system.
1563
+
1564
+ Args:
1565
+ name: Name of the data storage entry
1566
+ content: Content to store as a string
1567
+ description: Optional description of the data storage entry
1568
+ path: Optional path for the data storage entry
1569
+ dataset_id: Optional dataset ID to add entry to, or None to create new dataset
1570
+ project_id: ID of the project this data storage entry belongs to
1571
+
1572
+ Returns:
1573
+ DataStorageResponse containing the created data storage entry and storage locations
1574
+
1575
+ Raises:
1576
+ DataStorageCreationError: If there's an error creating the data storage entry
1577
+ """
1578
+ try:
1579
+ payload = DataStorageRequestPayload(
1580
+ name=name,
1581
+ content=content,
1582
+ description=description,
1583
+ path=path,
1584
+ dataset_id=dataset_id,
1585
+ project_id=project_id,
1586
+ )
1587
+ return await self._acreate_data_storage_entry(payload)
1588
+ except HTTPStatusError as e:
1589
+ self._handle_http_errors(e, "creating")
1590
+ except Exception as e:
1591
+ raise DataStorageCreationError(
1592
+ f"An unexpected error occurred: {e!r}"
1593
+ ) from e
1594
+
1595
+ @retry(
1596
+ stop=stop_after_attempt(3),
1597
+ wait=wait_exponential(multiplier=1, max=10),
1598
+ retry=retry_if_connection_error,
1599
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1600
+ )
1601
+ def store_file_content(
1602
+ self,
1603
+ name: str,
1604
+ file_path: str | Path,
1605
+ description: str | None = None,
1606
+ path: str | None = None,
1607
+ as_collection: bool = False,
1608
+ manifest_filename: str | None = None,
1609
+ ignore_patterns: list[str] | None = None,
1610
+ ignore_filename: str = ".gitignore",
1611
+ project_id: UUID | None = None,
1612
+ ) -> DataStorageResponse:
1613
+ """Store file or directory content in the data storage system.
1614
+
1615
+ For files: Small text files (< 10MB, supported formats) are sent as text content,
1616
+ larger/binary files are uploaded via signed URL.
1617
+
1618
+ For directories: uploaded either as a single zip file collection (as_collection=True,
1619
+ with ignore-pattern support) or as a hierarchy of individual entries (default).
1620
+
1621
+ Args:
1622
+ name: Name of the data storage entry
1623
+ file_path: Path to file or directory to upload
1624
+ description: Optional description of the data storage entry
1625
+ path: Optional path for the data storage entry
1626
+ as_collection: If true, upload directories as a single zip file collection.
1627
+ manifest_filename: Name of manifest file
1628
+ ignore_patterns: List of patterns to ignore when zipping directories
1629
+ ignore_filename: Name of ignore file to read from directory (default: .gitignore)
1630
+ project_id: ID of the project this data storage entry belongs to
1631
+
1632
+ Returns:
1633
+ DataStorageResponse containing the final data storage entry
1634
+
1635
+ Raises:
1636
+ DataStorageCreationError: If there's an error in the process
1637
+ """
1638
+ file_path = self._validate_file_path(file_path)
1639
+
1640
+ try:
1641
+ if file_path.is_dir() and as_collection:
1642
+ return self._upload_data_directory(
1643
+ name,
1644
+ file_path,
1645
+ description,
1646
+ path,
1647
+ ignore_patterns,
1648
+ ignore_filename,
1649
+ project_id,
1650
+ )
1651
+ if file_path.is_dir() and not as_collection:
1652
+ responses = self._upload_directory_hierarchically(
1653
+ name=name,
1654
+ dir_path=file_path,
1655
+ description=description,
1656
+ manifest_filename=manifest_filename,
1657
+ ignore_patterns=ignore_patterns,
1658
+ ignore_filename=ignore_filename,
1659
+ project_id=project_id,
1660
+ )
1661
+ if not responses:
1662
+ raise DataStorageCreationError(
1663
+ "No data storage entries were created"
1664
+ )
1665
+ return responses[0]
1666
+ return self._upload_data_single_file(
1667
+ name, file_path, description, path, project_id
1668
+ )
1669
+
1670
+ except HTTPStatusError as e:
1671
+ self._handle_http_errors(e, "creating")
1672
+ except Exception as e:
1673
+ raise DataStorageCreationError(
1674
+ f"An unexpected error occurred during file upload: {e!r}"
1675
+ ) from e
1676
+
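+ # Illustrative usage (assuming a configured client instance): a directory can be
+ # uploaded either as one zipped collection or as a hierarchy of individual entries:
+ #     client.store_file_content("dataset", "/path/to/dir", as_collection=True)
+ #     client.store_file_content("dataset", "/path/to/dir")  # hierarchical (default)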
1677
+ @retry(
1678
+ stop=stop_after_attempt(3),
1679
+ wait=wait_exponential(multiplier=1, max=10),
1680
+ retry=retry_if_connection_error,
1681
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1682
+ )
1683
+ async def astore_file_content(
1684
+ self,
1685
+ name: str,
1686
+ file_path: str | Path,
1687
+ description: str | None = None,
1688
+ path: str | None = None,
1689
+ as_collection: bool = False,
1690
+ manifest_filename: str | None = None,
1691
+ ignore_patterns: list[str] | None = None,
1692
+ ignore_filename: str = ".gitignore",
1693
+ dataset_id: UUID | None = None,
1694
+ project_id: UUID | None = None,
1695
+ ) -> DataStorageResponse:
1696
+ """Asynchronously store file or directory content in the data storage system.
1697
+
1698
+ Args:
1699
+ name: Name of the data storage entry.
1700
+ file_path: Path to the file or directory to upload.
1701
+ description: Optional description for the entry.
1702
+ path: Optional GCS path for the entry.
1703
+ as_collection: If uploading a directory, `True` zips it into a single collection,
1704
+ `False` uploads it as a hierarchical structure of individual objects.
1705
+ manifest_filename: Optional manifest file for hierarchical uploads.
1706
+ ignore_patterns: List of patterns to ignore when zipping.
1707
+ ignore_filename: Name of ignore file to read (default: .gitignore).
1708
+ dataset_id: Optional dataset ID to add entry to, or None to create new dataset.
1709
+ project_id: ID of the project this data storage entry belongs to
1710
+
1711
+ Returns:
1712
+ The `DataStorageResponse` for the created entry. For hierarchical uploads,
1713
+ this is the response for the root directory entry.
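+
+ Example (illustrative sketch; `client` is an authenticated client and `dataset_id`
+ below is a placeholder UUID of an existing dataset):
+ response = await client.astore_file_content(
+ name="training-data",
+ file_path="data/train",
+ as_collection=False,
+ dataset_id=dataset_id,
+ )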
1714
+ """
1715
+ file_path = self._validate_file_path(file_path)
1716
+
1717
+ try:
1718
+ if file_path.is_dir():
1719
+ if as_collection:
1720
+ return await self._aupload_data_directory(
1721
+ name,
1722
+ file_path,
1723
+ description,
1724
+ path,
1725
+ ignore_patterns,
1726
+ ignore_filename,
1727
+ project_id,
1728
+ )
1729
+ responses = await self._aupload_directory_hierarchically(
1730
+ name=name,
1731
+ dir_path=file_path,
1732
+ description=description,
1733
+ manifest_filename=manifest_filename,
1734
+ ignore_patterns=ignore_patterns,
1735
+ ignore_filename=ignore_filename,
1736
+ dataset_id=dataset_id,
1737
+ project_id=project_id,
1738
+ )
1739
+ if not responses:
1740
+ raise DataStorageCreationError(
1741
+ "No data storage entries were created"
1742
+ )
1743
+ return responses[0]
1744
+ return await self._aupload_data_single_file(
1745
+ name, file_path, description, path, dataset_id, project_id
1746
+ )
1747
+
1748
+ except HTTPStatusError as e:
1749
+ self._handle_http_errors(e, "creating")
1750
+ except Exception as e:
1751
+ raise DataStorageCreationError(
1752
+ f"An unexpected error occurred during async file upload: {e!r}"
1753
+ ) from e
1754
+
1755
+ @retry(
1756
+ stop=stop_after_attempt(3),
1757
+ wait=wait_exponential(multiplier=1, max=10),
1758
+ retry=retry_if_connection_error,
1759
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1760
+ )
1761
+ def register_existing_data_source(
1762
+ self,
1763
+ name: str,
1764
+ existing_location: DataStorageLocationPayload,
1765
+ description: str | None = None,
1766
+ as_collection: bool = False,
1767
+ path: str | None = None,
1768
+ project_id: UUID | None = None,
1769
+ ) -> DataStorageResponse:
1770
+ """Store content as a string in the data storage system.
1771
+
1772
+ Args:
1773
+ name: Name of the data storage entry
1774
+ existing_location: Describes the existing data source location to register
1775
+ description: Optional description of the data storage entry
1776
+ as_collection: If uploading a directory, `True` creates a single storage entry for
1777
+ the whole directory and multiple storage locations for each file, `False` assumes
1778
+ you are uploading a single file.
1779
+ path: Optional path for the data storage entry
1780
+ project_id: ID of the project this data storage entry belongs to
1781
+
1782
+ Returns:
1783
+ DataStorageResponse containing the created data storage entry and storage locations
1784
+
1785
+ Raises:
1786
+ DataStorageCreationError: If there's an error creating the data storage entry
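+
+ Example (illustrative sketch; the DataStorageLocationPayload field values are elided
+ because they depend on the source being registered):
+ location = DataStorageLocationPayload(...) # fill in per the model definition
+ response = client.register_existing_data_source(
+ name="external-results",
+ existing_location=location,
+ description="Results that already live in an external bucket",
+ )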
1787
+ """
1788
+ try:
1789
+ payload = DataStorageRequestPayload(
1790
+ name=name,
1791
+ description=description,
1792
+ path=path,
1793
+ existing_location=existing_location,
1794
+ project_id=project_id,
1795
+ is_collection=as_collection,
1796
+ )
1797
+ response = self.client.post(
1798
+ "/v0.1/data-storage/data-entries",
1799
+ json=payload.model_dump(exclude_none=True),
1800
+ )
1801
+ response.raise_for_status()
1802
+ return DataStorageResponse.model_validate(response.json())
1803
+ except HTTPStatusError as e:
1804
+ self._handle_http_errors(e, "creating")
1805
+ except Exception as e:
1806
+ raise DataStorageCreationError(
1807
+ f"An unexpected error occurred: {e!r}"
1808
+ ) from e
1809
+
1810
+ @retry(
1811
+ stop=stop_after_attempt(3),
1812
+ wait=wait_exponential(multiplier=1, max=10),
1813
+ retry=retry_if_connection_error,
1814
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1815
+ )
1816
+ async def aregister_existing_data_source(
1817
+ self,
1818
+ name: str,
1819
+ existing_location: DataStorageLocationPayload,
1820
+ as_collection: bool = False,
1821
+ description: str | None = None,
1822
+ path: str | None = None,
1823
+ project_id: UUID | None = None,
1824
+ ) -> DataStorageResponse:
1825
+ """Store content as a string in the data storage system.
1826
+
1827
+ Args:
1828
+ name: Name of the data storage entry
1829
+ existing_location: Describes the existing data source location to register
1830
+ description: Optional description of the data storage entry
1831
+ as_collection: If uploading a directory, `True` creates a single storage entry for
1832
+ the whole directory and multiple storage locations for each file, `False` assumes
1833
+ you are uploading a single file.
1834
+ path: Optional path for the data storage entry
1835
+ project_id: ID of the project this data storage entry belongs to
1836
+
1837
+ Returns:
1838
+ DataStorageResponse containing the created data storage entry and storage locations
1839
+
1840
+ Raises:
1841
+ DataStorageCreationError: If there's an error creating the data storage entry
1842
+ """
1843
+ try:
1844
+ payload = DataStorageRequestPayload(
1845
+ name=name,
1846
+ description=description,
1847
+ path=path,
1848
+ existing_location=existing_location,
1849
+ project_id=project_id,
1850
+ is_collection=as_collection,
1851
+ )
1852
+ response = await self.async_client.post(
1853
+ "/v0.1/data-storage/data-entries",
1854
+ json=payload.model_dump(exclude_none=True),
1855
+ )
1856
+ response.raise_for_status()
1857
+ return DataStorageResponse.model_validate(response.json())
1858
+ except HTTPStatusError as e:
1859
+ self._handle_http_errors(e, "creating")
1860
+ except Exception as e:
1861
+ raise DataStorageCreationError(
1862
+ f"An unexpected error occurred: {e!r}"
1863
+ ) from e
1864
+
1865
+ @retry(
1866
+ stop=stop_after_attempt(3),
1867
+ wait=wait_exponential(multiplier=1, max=10),
1868
+ retry=retry_if_connection_error,
1869
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1870
+ )
1871
+ def search_data_storage(
1872
+ self,
1873
+ criteria: list[SearchCriterion] | None = None,
1874
+ size: int = 10,
1875
+ ) -> list[dict]:
1876
+ """Search data storage objects using structured criteria.
1877
+
1878
+ Args:
1879
+ criteria: List of search criteria (SearchCriterion objects with field, operator, value)
1880
+ size: Number of results to return (1-100)
1881
+
1882
+ Returns:
1883
+ List of search results with scores and data storage information
1884
+
1885
+ Raises:
1886
+ DataStorageCreationError: If there's an error searching data storage entries
1887
+
1888
+ Example:
1889
+ from futurehouse_client.models.rest import SearchCriterion, SearchOperator
1890
+ criteria = [
1891
+ SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
1892
+ SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
1893
+ SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
1894
+ ]
1895
+ results = client.search_data_storage(criteria=criteria, size=20)
1896
+ """
1897
+ try:
1898
+ payload = DataStorageSearchPayload(
1899
+ criteria=criteria or [],
1900
+ size=max(1, min(100, size)), # Clamp between 1-100
1901
+ )
1902
+
1903
+ response = self.client.post(
1904
+ "/v0.1/data-storage/search",
1905
+ json=payload.model_dump(mode="json"),
1906
+ )
1907
+ response.raise_for_status()
1908
+ return response.json()
1909
+
1910
+ except HTTPStatusError as e:
1911
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
1912
+ raise DataStorageCreationError(
1913
+ "Search functionality is currently unavailable"
1914
+ ) from e
1915
+ self._handle_http_errors(e, "searching")
1916
+ except Exception as e:
1917
+ raise DataStorageCreationError(
1918
+ f"An unexpected error occurred during search: {e!r}"
1919
+ ) from e
1920
+
1921
+ @retry(
1922
+ stop=stop_after_attempt(3),
1923
+ wait=wait_exponential(multiplier=1, max=10),
1924
+ retry=retry_if_connection_error,
1925
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1926
+ )
1927
+ async def asearch_data_storage(
1928
+ self,
1929
+ criteria: list[SearchCriterion] | None = None,
1930
+ size: int = 10,
1931
+ ) -> list[dict]:
1932
+ """Asynchronously search data storage objects using structured criteria.
1933
+
1934
+ Args:
1935
+ criteria: List of search criteria (SearchCriterion objects with field, operator, value)
1936
+ size: Number of results to return (1-100)
1937
+
1938
+ Returns:
1939
+ List of search results with scores and data storage information
1940
+
1941
+ Raises:
1942
+ DataStorageCreationError: If there's an error searching data storage entries
1943
+
1944
+ Example:
1945
+ from futurehouse_client.models.rest import SearchCriterion, SearchOperator
1946
+ criteria = [
1947
+ SearchCriterion(field="name", operator=SearchOperator.CONTAINS, value="document"),
1948
+ SearchCriterion(field="project_id", operator=SearchOperator.EQUALS, value="my-project-id"),
1949
+ SearchCriterion(field="status", operator=SearchOperator.EQUALS, value="active"),
1950
+ ]
1951
+ results = await client.asearch_data_storage(criteria=criteria, size=20)
1952
+ """
1953
+ try:
1954
+ payload = DataStorageSearchPayload(
1955
+ criteria=criteria or [],
1956
+ size=max(1, min(100, size)), # Clamp between 1-100
1957
+ )
1958
+
1959
+ response = await self.async_client.post(
1960
+ "/v0.1/data-storage/search",
1961
+ json=payload.model_dump(mode="json"),
1962
+ )
1963
+ response.raise_for_status()
1964
+ return response.json()
1965
+
1966
+ except HTTPStatusError as e:
1967
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
1968
+ raise DataStorageCreationError(
1969
+ "Search functionality is currently unavailable"
1970
+ ) from e
1971
+ self._handle_http_errors(e, "searching")
1972
+ except Exception as e:
1973
+ raise DataStorageCreationError(
1974
+ f"An unexpected error occurred during async search: {e!r}"
1975
+ ) from e
1976
+
1977
+ @retry(
1978
+ stop=stop_after_attempt(3),
1979
+ wait=wait_exponential(multiplier=1, max=10),
1980
+ retry=retry_if_connection_error,
1981
+ before_sleep=before_sleep_log(logger, logging.WARNING),
1982
+ )
1983
+ def similarity_search_data_storage(
1984
+ self,
1985
+ embedding: list[float],
1986
+ size: int = 10,
1987
+ min_score: float = 0.7,
1988
+ dataset_id: UUID | None = None,
1989
+ tags: list[str] | None = None,
1990
+ user_id: str | None = None,
1991
+ project_id: str | None = None,
1992
+ ) -> list[dict]:
1993
+ """Search data storage objects using vector similarity.
1994
+
1995
+ Args:
1996
+ embedding: Embedding vector for similarity search
1997
+ size: Number of results to return (1-100)
1998
+ min_score: Minimum similarity score (0.0-1.0)
1999
+ dataset_id: Optional dataset ID filter
2000
+ tags: Optional list of tags to filter by
2001
+ user_id: Optional user ID filter (admin only)
2002
+ project_id: Optional project ID filter
2003
+
2004
+ Returns:
2005
+ List of search results with similarity scores and data storage information
2006
+
2007
+ Raises:
2008
+ DataStorageCreationError: If there's an error performing similarity search
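+
+ Example (illustrative; `query_embedding` is a placeholder list[float] produced by
+ whatever embedding model your entries were indexed with):
+ results = client.similarity_search_data_storage(
+ embedding=query_embedding,
+ size=20,
+ min_score=0.75,
+ tags=["reports"],
+ )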
2009
+ """
2010
+ try:
2011
+ # Validate inputs
2012
+ if not embedding:
2013
+ raise DataStorageCreationError("Embedding vector is required")
2014
+
2015
+ if not all(isinstance(x, int | float) for x in embedding):
2016
+ raise DataStorageCreationError("Embedding must be a list of numbers")
2017
+
2018
+ size = max(1, min(100, size)) # Clamp between 1-100
2019
+ min_score = max(0.0, min(1.0, min_score)) # Clamp between 0.0-1.0
2020
+
2021
+ # Build request payload
2022
+ payload = {
2023
+ "embedding": embedding,
2024
+ "size": size,
2025
+ "min_score": min_score,
2026
+ }
2027
+
2028
+ # Add optional filters
2029
+ if dataset_id is not None:
2030
+ payload["dataset_id"] = str(dataset_id)
2031
+ if tags is not None:
2032
+ payload["tags"] = tags
2033
+ if user_id is not None:
2034
+ payload["user_id"] = user_id
2035
+ if project_id is not None:
2036
+ payload["project_id"] = project_id
2037
+
2038
+ response = self.client.post(
2039
+ "/v0.1/data-storage/similarity-search", json=payload
2040
+ )
2041
+ response.raise_for_status()
2042
+ return response.json()
2043
+
2044
+ except HTTPStatusError as e:
2045
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
2046
+ raise DataStorageCreationError(
2047
+ "Similarity search functionality is currently unavailable"
2048
+ ) from e
2049
+ self._handle_http_errors(e, "performing similarity search")
2050
+ except Exception as e:
2051
+ raise DataStorageCreationError(
2052
+ f"An unexpected error occurred during similarity search: {e!r}"
2053
+ ) from e
2054
+
2055
+ @retry(
2056
+ stop=stop_after_attempt(3),
2057
+ wait=wait_exponential(multiplier=1, max=10),
2058
+ retry=retry_if_connection_error,
2059
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2060
+ )
2061
+ async def asimilarity_search_data_storage(
2062
+ self,
2063
+ embedding: list[float],
2064
+ size: int = 10,
2065
+ min_score: float = 0.7,
2066
+ dataset_id: UUID | None = None,
2067
+ tags: list[str] | None = None,
2068
+ user_id: str | None = None,
2069
+ project_id: str | None = None,
2070
+ ) -> list[dict]:
2071
+ """Asynchronously search data storage objects using vector similarity.
2072
+
2073
+ Args:
2074
+ embedding: Embedding vector for similarity search
2075
+ size: Number of results to return (1-100)
2076
+ min_score: Minimum similarity score (0.0-1.0)
2077
+ dataset_id: Optional dataset ID filter
2078
+ tags: Optional list of tags to filter by
2079
+ user_id: Optional user ID filter (admin only)
2080
+ project_id: Optional project ID filter
2081
+
2082
+ Returns:
2083
+ List of search results with similarity scores and data storage information
2084
+
2085
+ Raises:
2086
+ DataStorageCreationError: If there's an error performing similarity search
2087
+ """
2088
+ try:
2089
+ # Validate inputs
2090
+ if not embedding:
2091
+ raise DataStorageCreationError("Embedding vector is required")
2092
+
2093
+ if not all(isinstance(x, int | float) for x in embedding):
2094
+ raise DataStorageCreationError("Embedding must be a list of numbers")
2095
+
2096
+ size = max(1, min(100, size)) # Clamp between 1-100
2097
+ min_score = max(0.0, min(1.0, min_score)) # Clamp between 0.0-1.0
2098
+
2099
+ # Build request payload
2100
+ payload = {
2101
+ "embedding": embedding,
2102
+ "size": size,
2103
+ "min_score": min_score,
2104
+ }
2105
+
2106
+ # Add optional filters
2107
+ if dataset_id is not None:
2108
+ payload["dataset_id"] = str(dataset_id)
2109
+ if tags is not None:
2110
+ payload["tags"] = tags
2111
+ if user_id is not None:
2112
+ payload["user_id"] = user_id
2113
+ if project_id is not None:
2114
+ payload["project_id"] = project_id
2115
+
2116
+ response = await self.async_client.post(
2117
+ "/v0.1/data-storage/similarity-search", json=payload
2118
+ )
2119
+ response.raise_for_status()
2120
+ return response.json()
2121
+
2122
+ except HTTPStatusError as e:
2123
+ if e.response.status_code == codes.SERVICE_UNAVAILABLE:
2124
+ raise DataStorageCreationError(
2125
+ "Similarity search functionality is currently unavailable"
2126
+ ) from e
2127
+ self._handle_http_errors(e, "performing similarity search")
2128
+ except Exception as e:
2129
+ raise DataStorageCreationError(
2130
+ f"An unexpected error occurred during async similarity search: {e!r}"
2131
+ ) from e
2132
+
2133
+ # TODO: EVERYTHING BELOW THIS LINE SHOULD BE MOVED TO FH_TOOLS REPO
2134
+ # =================================================
2135
+ @retry(
2136
+ stop=stop_after_attempt(3),
2137
+ wait=wait_exponential(multiplier=1, max=10),
2138
+ retry=retry_if_connection_error,
2139
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2140
+ )
2141
+ def fetch_data_from_storage(
2142
+ self,
2143
+ data_storage_id: UUID | None = None,
2144
+ ) -> str | Path | list[Path] | None:
2145
+ """Fetch data from the storage system (sync version).
2146
+
2147
+ Args:
2148
+ data_storage_id: ID of the data storage entry to fetch
2149
+
2150
+ Returns:
2151
+ For raw content / PG table storage: the stored string content
2152
+ For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
2153
+ For multi-location entries: list of Paths to the downloaded files
2154
+ None if not found or error occurred
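+
+ Example (illustrative; `entry_id` is a placeholder UUID of an entry you can access):
+ result = client.fetch_data_from_storage(data_storage_id=entry_id)
+ if isinstance(result, Path):
+ print(f"downloaded to {result}")
+ elif isinstance(result, list):
+ print(f"downloaded {len(result)} files")
+ else:
+ print(result) # raw string content, or None if nothing was stored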
2155
+ """
2156
+ if not data_storage_id:
2157
+ raise DataStorageRetrievalError(
2158
+ "data_storage_id must be provided at this time"
2159
+ )
2160
+
2161
+ try:
2162
+ response = self.client.get(
2163
+ f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
2164
+ )
2165
+ response.raise_for_status()
2166
+ result = DataStorageResponse.model_validate(response.json())
2167
+
2168
+ if len(result.storage_locations) > 1:
2169
+ return [
2170
+ self._download_from_gcs(
2171
+ location.storage_config.signed_url or "",
2172
+ (location.storage_config.location or "").split("/")[-1],
2173
+ )
2174
+ for location in result.storage_locations
2175
+ ]
2176
+
2177
+ # Most scenarios will only have one location
2178
+ storage_location = result.storage_locations[0]
2179
+ storage_type = storage_location.storage_config.storage_type
2180
+
2181
+ if storage_type == "gcs":
2182
+ if not storage_location.storage_config.signed_url:
2183
+ raise DataStorageRetrievalError(
2184
+ "No signed URL available for GCS download"
2185
+ )
2186
+
2187
+ return self._download_from_gcs(
2188
+ storage_location.storage_config.signed_url
2189
+ )
2190
+
2191
+ if storage_type in {"raw_content", "pg_table"}:
2192
+ content = result.data_storage.content
2193
+ if content is None:
2194
+ logger.warning(
2195
+ f"No content found for data storage entry {data_storage_id}"
2196
+ )
2197
+ return None
2198
+ return content
2199
+
2200
+ raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")
2201
+
2202
+ except HTTPStatusError as e:
2203
+ self._handle_http_errors(e, "retrieving")
2204
+ except Exception as e:
2205
+ raise DataStorageRetrievalError(
2206
+ f"An unexpected error occurred: {e!r}"
2207
+ ) from e
2208
+
2209
+ @retry(
2210
+ stop=stop_after_attempt(3),
2211
+ wait=wait_exponential(multiplier=1, max=10),
2212
+ retry=retry_if_connection_error,
2213
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2214
+ )
2215
+ async def afetch_data_from_storage(
2216
+ self,
2217
+ data_storage_id: UUID | None = None,
2218
+ ) -> str | Path | list[Path] | None:
2219
+ """Fetch data from the storage system.
2220
+
2221
+ Args:
2222
+ data_storage_id: ID of the data storage entry to fetch
2223
+
2224
+ Returns:
2225
+ For raw content / PG table storage: the stored string content
2226
+ For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
2227
+ For multi-location entries: list of Paths to the downloaded files
2228
+ None if not found or error occurred
2229
+ """
2230
+ if not data_storage_id:
2231
+ raise DataStorageRetrievalError(
2232
+ "data_storage_id must be provided at this time"
2233
+ )
2234
+
2235
+ try:
2236
+ response = await self.async_client.get(
2237
+ f"/v0.1/data-storage/data-entries/{data_storage_id}", timeout=100
2238
+ )
2239
+ response.raise_for_status()
2240
+ result = DataStorageResponse.model_validate(response.json())
2241
+
2242
+ if len(result.storage_locations) > 1:
2243
+ return await gather_with_concurrency(
2244
+ DOWNLOAD_CONCURRENCY,
2245
+ [
2246
+ self._adownload_from_gcs(
2247
+ location.storage_config.signed_url or "",
2248
+ (location.storage_config.location or "").split("/")[-1],
2249
+ )
2250
+ for location in result.storage_locations
2251
+ ],
2252
+ )
2253
+
2254
+ # Most scenarios will only have one location
2255
+ storage_location = result.storage_locations[0]
2256
+ storage_type = storage_location.storage_config.storage_type
2257
+
2258
+ if storage_type == "gcs":
2259
+ if not storage_location.storage_config.signed_url:
2260
+ raise DataStorageRetrievalError(
2261
+ "No signed URL available for GCS download"
2262
+ )
2263
+
2264
+ return await self._adownload_from_gcs(
2265
+ storage_location.storage_config.signed_url
2266
+ )
2267
+
2268
+ if storage_type in {"raw_content", "pg_table"}:
2269
+ content = result.data_storage.content
2270
+ if content is None:
2271
+ logger.warning(
2272
+ f"No content found for data storage entry {data_storage_id}"
2273
+ )
2274
+ return None
2275
+ return content
2276
+
2277
+ raise DataStorageRetrievalError(f"Unsupported storage type: {storage_type}")
2278
+
2279
+ except HTTPStatusError as e:
2280
+ self._handle_http_errors(e, "retrieving")
2281
+ except Exception as e:
2282
+ raise DataStorageRetrievalError(
2283
+ f"An unexpected error occurred: {e!r}"
2284
+ ) from e
2285
+
2286
+ @retry(
2287
+ stop=stop_after_attempt(3),
2288
+ wait=wait_exponential(multiplier=1, max=10),
2289
+ retry=retry_if_connection_error,
2290
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2291
+ )
2292
+ async def acreate_dataset(
2293
+ self,
2294
+ name: str,
2295
+ description: str | None = None,
2296
+ dataset_id: UUID | None = None,
2297
+ ):
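+ """Asynchronously create a dataset that groups data storage entries.
+
+ Args:
+ name: Name of the dataset
+ description: Optional description of the dataset
+ dataset_id: Optional explicit ID for the new dataset
+
+ Returns:
+ The created dataset, validated as a CreateDatasetPayload.
+
+ Raises:
+ DataStorageCreationError: If there's an error creating the dataset
+ """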
2298
+ try:
2299
+ payload = CreateDatasetPayload(
2300
+ name=name,
2301
+ description=description,
2302
+ id=dataset_id,
2303
+ )
2304
+ response = await self.async_client.post(
2305
+ "/v0.1/data-storage/datasets",
2306
+ json=payload.model_dump(exclude_none=True),
2307
+ )
2308
+ response.raise_for_status()
2309
+ return CreateDatasetPayload.model_validate(response.json())
2310
+ except HTTPStatusError as e:
2311
+ self._handle_http_errors(e, "creating")
2312
+ except Exception as e:
2313
+ raise DataStorageCreationError(
2314
+ f"An unexpected error occurred: {e!r}"
2315
+ ) from e
2316
+
2317
+ @retry(
2318
+ stop=stop_after_attempt(3),
2319
+ wait=wait_exponential(multiplier=1, max=10),
2320
+ retry=retry_if_connection_error,
2321
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2322
+ )
2323
+ def create_dataset(
2324
+ self,
2325
+ name: str,
2326
+ description: str | None = None,
2327
+ dataset_id: UUID | None = None,
2328
+ ):
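+ """Create a dataset that groups data storage entries.
+
+ Args:
+ name: Name of the dataset
+ description: Optional description of the dataset
+ dataset_id: Optional explicit ID for the new dataset
+
+ Returns:
+ The created dataset, validated as a CreateDatasetPayload.
+
+ Raises:
+ DataStorageCreationError: If there's an error creating the dataset
+ """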
2329
+ try:
2330
+ payload = CreateDatasetPayload(
2331
+ name=name,
2332
+ description=description,
2333
+ id=dataset_id,
2334
+ )
2335
+ response = self.client.post(
2336
+ "/v0.1/data-storage/datasets",
2337
+ json=payload.model_dump(exclude_none=True),
2338
+ )
2339
+ response.raise_for_status()
2340
+ return CreateDatasetPayload.model_validate(response.json())
2341
+ except HTTPStatusError as e:
2342
+ self._handle_http_errors(e, "creating")
2343
+ except Exception as e:
2344
+ raise DataStorageCreationError(
2345
+ f"An unexpected error occurred: {e!r}"
2346
+ ) from e
2347
+
2348
+ @retry(
2349
+ stop=stop_after_attempt(3),
2350
+ wait=wait_exponential(multiplier=1, max=10),
2351
+ retry=retry_if_connection_error,
2352
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2353
+ )
2354
+ async def adelete_dataset(self, dataset_id: UUID):
2355
+ """Delete a dataset.
2356
+
2357
+ Note: This will delete all data storage entries associated with the dataset.
2358
+
2359
+ Args:
2360
+ dataset_id: ID of the dataset to delete
2361
+
2362
+ Raises:
2363
+ DataStorageError: If there's an error deleting the dataset
2364
+ """
2365
+ try:
2366
+ await self.async_client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
2367
+ except HTTPStatusError as e:
2368
+ self._handle_http_errors(e, "deleting")
2369
+ except Exception as e:
2370
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2371
+
2372
+ @retry(
2373
+ stop=stop_after_attempt(3),
2374
+ wait=wait_exponential(multiplier=1, max=10),
2375
+ retry=retry_if_connection_error,
2376
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2377
+ )
2378
+ def delete_dataset(self, dataset_id: UUID):
2379
+ """Delete a dataset.
2380
+
2381
+ Note: This will delete all data storage entries associated with the dataset.
2382
+
2383
+ Args:
2384
+ dataset_id: ID of the dataset to delete
2385
+
2386
+ Raises:
2387
+ DataStorageError: If there's an error deleting the dataset
2388
+ """
2389
+ try:
2390
+ self.client.delete(f"/v0.1/data-storage/datasets/{dataset_id}")
2391
+ except HTTPStatusError as e:
2392
+ self._handle_http_errors(e, "deleting")
2393
+ except Exception as e:
2394
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2395
+
2396
+ @retry(
2397
+ stop=stop_after_attempt(3),
2398
+ wait=wait_exponential(multiplier=1, max=10),
2399
+ retry=retry_if_connection_error,
2400
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2401
+ )
2402
+ async def aget_dataset(self, dataset_id: UUID):
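+ """Asynchronously retrieve a dataset by its ID.
+
+ Args:
+ dataset_id: ID of the dataset to retrieve
+
+ Returns:
+ The dataset as a JSON-decoded dictionary.
+
+ Raises:
+ DataStorageError: If there's an error retrieving the dataset
+ """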
2403
+ try:
2404
+ response = await self.async_client.get(
2405
+ f"/v0.1/data-storage/datasets/{dataset_id}"
2406
+ )
2407
+ response.raise_for_status()
2408
+
2409
+ return response.json()
2410
+ except HTTPStatusError as e:
2411
+ self._handle_http_errors(e, "retrieving")
2412
+ except Exception as e:
2413
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2414
+
2415
+ @retry(
2416
+ stop=stop_after_attempt(3),
2417
+ wait=wait_exponential(multiplier=1, max=10),
2418
+ retry=retry_if_connection_error,
2419
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2420
+ )
2421
+ def get_dataset(self, dataset_id: UUID):
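+ """Retrieve a dataset by its ID.
+
+ Args:
+ dataset_id: ID of the dataset to retrieve
+
+ Returns:
+ The dataset as a JSON-decoded dictionary.
+
+ Raises:
+ DataStorageError: If there's an error retrieving the dataset
+ """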
2422
+ try:
2423
+ response = self.client.get(f"/v0.1/data-storage/datasets/{dataset_id}")
2424
+ response.raise_for_status()
2425
+
2426
+ return response.json()
2427
+ except HTTPStatusError as e:
2428
+ self._handle_http_errors(e, "retrieving")
2429
+ except Exception as e:
2430
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2431
+
2432
+ @retry(
2433
+ stop=stop_after_attempt(3),
2434
+ wait=wait_exponential(multiplier=1, max=10),
2435
+ retry=retry_if_connection_error,
2436
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2437
+ )
2438
+ async def adelete_data_storage_entry(self, data_storage_entry_id: UUID):
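+ """Asynchronously delete a data storage entry.
+
+ Args:
+ data_storage_entry_id: ID of the data storage entry to delete
+
+ Raises:
+ DataStorageError: If there's an error deleting the data storage entry
+ """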
2439
+ try:
2440
+ response = await self.async_client.delete(
2441
+ f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
2442
+ )
+ response.raise_for_status()
2443
+ except HTTPStatusError as e:
2444
+ self._handle_http_errors(e, "deleting")
2445
+ except Exception as e:
2446
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e
2447
+
2448
+ @retry(
2449
+ stop=stop_after_attempt(3),
2450
+ wait=wait_exponential(multiplier=1, max=10),
2451
+ retry=retry_if_connection_error,
2452
+ before_sleep=before_sleep_log(logger, logging.WARNING),
2453
+ )
2454
+ def delete_data_storage_entry(self, data_storage_entry_id: UUID):
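+ """Delete a data storage entry.
+
+ Args:
+ data_storage_entry_id: ID of the data storage entry to delete
+
+ Raises:
+ DataStorageError: If there's an error deleting the data storage entry
+ """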
2455
+ try:
2456
+ response = self.client.delete(
2457
+ f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
2458
+ )
+ response.raise_for_status()
2459
+ except HTTPStatusError as e:
2460
+ self._handle_http_errors(e, "deleting")
2461
+ except Exception as e:
2462
+ raise DataStorageError(f"An unexpected error occurred: {e!r}") from e