futurehouse-client 0.4.0__py3-none-any.whl → 0.4.1.dev95__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1876 @@
+ import asyncio
+ import fnmatch
+ import json
+ import logging
+ import shutil
+ import tempfile
+ import zipfile
+ from os import PathLike
+ from pathlib import Path
+ from typing import NoReturn
+ from uuid import UUID
+
+ import aiofiles
+ import aiohttp
+ import requests as requests_lib
+ from google.resumable_media import requests as resumable_requests
+ from httpx import AsyncClient, Client, HTTPStatusError, codes
+ from requests.adapters import HTTPAdapter
+ from tenacity import (
+     before_sleep_log,
+     retry,
+     stop_after_attempt,
+     wait_exponential,
+ )
+ from tqdm import tqdm
+ from urllib3.util.retry import Retry
+
+ from futurehouse_client.models.data_storage_methods import (
+     DataStorageLocationPayload,
+     DataStorageRequestPayload,
+     DataStorageResponse,
+     DirectoryManifest,
+     ManifestEntry,
+ )
+ from futurehouse_client.utils.general import retry_if_connection_error
+
+ # pyyaml is only required when parsing a YAML manifest
+ try:
+     import yaml
+ except ImportError:
+     yaml = None  # type: ignore[assignment]
+
+
+ logger = logging.getLogger(__name__)
+
+ # TODO: pdf support, unsure what package we want to use
+ SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT = ["txt", "md", "csv", "json", "yaml", "yml"]
+ CHUNK_SIZE = 8 * 1024 * 1024  # 8MB
+ MAX_RETRIES = 3
+ SMALL_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024  # 10MB
+ HTTP_RESUME_INCOMPLETE = 308
+ INITIATE_HEADERS = {
+     "Content-Type": "application/octet-stream",
+     "x-goog-resumable": "start",
+     "Content-Length": "0",
+ }
+
+
+ def _should_ignore_file(
+     file_path: Path | PathLike,
+     base_path: Path | PathLike,
+     ignore_patterns: list[str] | None = None,
+ ) -> bool:
+     """Check if a file should be ignored based on ignore patterns.
+
+     Args:
+         file_path: Path to the file to check
+         base_path: Base directory path
+         ignore_patterns: List of ignore patterns (supports gitignore-style patterns)
+
+     Returns:
+         True if file should be ignored
+     """
+     if not ignore_patterns:
+         return False
+
+     try:
+         file_path = Path(file_path)
+         base_path = Path(base_path)
+         rel_path = file_path.relative_to(base_path)
+         rel_path_str = str(rel_path)
+
+         for pattern in ignore_patterns:
+             pattern = pattern.strip()
+             if not pattern or pattern.startswith("#"):
+                 continue
+
+             is_absolute_match = pattern.startswith("/") and rel_path_str.startswith(
+                 pattern[1:]
+             )
+             is_nested_match = "/" in pattern and pattern in rel_path_str
+             is_name_match = fnmatch.fnmatch(file_path.name, pattern)
+             is_part_match = pattern in rel_path.parts
+
+             if is_absolute_match or is_nested_match or is_name_match or is_part_match:
+                 return True
+
+     except ValueError:
+         pass
+
+     return False
+
+
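A quick sketch of how these gitignore-style patterns behave in `_should_ignore_file` (paths are illustrative):

    from pathlib import Path

    base = Path("/repo")
    patterns = ["/build", "src/generated", "*.pyc", "node_modules"]

    # "/build" anchors at the top of the tree
    _should_ignore_file(base / "build" / "out.bin", base, patterns)  # True
    # "src/generated" matches wherever it appears in the relative path
    _should_ignore_file(base / "src" / "generated" / "a.py", base, patterns)  # True
    # "*.pyc" matches on the file name
    _should_ignore_file(base / "pkg" / "mod.pyc", base, patterns)  # True
    # "node_modules" matches any path component
    _should_ignore_file(base / "web" / "node_modules" / "x.js", base, patterns)  # True
    _should_ignore_file(base / "src" / "main.py", base, patterns)  # False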
+ def _read_ignore_file(dir_path: Path, ignore_filename: str = ".gitignore") -> list[str]:
+     """Read ignore patterns from a file in the directory.
+
+     Args:
+         dir_path: Directory to look for ignore file
+         ignore_filename: Name of ignore file to read
+
+     Returns:
+         List of ignore patterns
+     """
+     ignore_file = dir_path / ignore_filename
+     if ignore_file.exists():
+         try:
+             with open(ignore_file, encoding="utf-8") as f:
+                 return [line.strip() for line in f]
+         except Exception as e:
+             logger.warning(f"Failed to read {ignore_filename}: {e}")
+             return []
+     return []
+
+
+ def _collect_ignore_patterns(
+     dir_path: Path,
+     ignore_patterns: list[str] | None = None,
+     ignore_filename: str = ".gitignore",
+ ) -> list[str]:
+     """Collect all ignore patterns from multiple sources.
+
+     Args:
+         dir_path: Directory to check for ignore files
+         ignore_patterns: Explicit ignore patterns
+         ignore_filename: Name of ignore file to read from directory
+
+     Returns:
+         Combined list of ignore patterns
+     """
+     # copy so the caller's list is never mutated in place
+     all_ignore_patterns = list(ignore_patterns or [])
+     file_patterns = _read_ignore_file(dir_path, ignore_filename)
+     all_ignore_patterns.extend(file_patterns)
+
+     default_ignores = [".git", "__pycache__", "*.pyc", ".DS_Store", "node_modules"]
+     all_ignore_patterns.extend(default_ignores)
+
+     return all_ignore_patterns
+
+
+ def _create_directory_zip(
+     dir_path: Path,
+     zip_path: Path,
+     ignore_patterns: list[str] | None = None,
+     ignore_filename: str = ".gitignore",
+ ) -> int:
+     """Create a zip file from a directory with ignore patterns.
+
+     Args:
+         dir_path: Directory to zip
+         zip_path: Output zip file path
+         ignore_patterns: Explicit ignore patterns
+         ignore_filename: Name of ignore file to read from directory
+
+     Returns:
+         Size of created zip file in bytes
+     """
+     all_ignore_patterns = _collect_ignore_patterns(
+         dir_path, ignore_patterns, ignore_filename
+     )
+
+     logger.debug(f"Creating zip with ignore patterns: {all_ignore_patterns}")
+
+     with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+         for file_path in dir_path.rglob("*"):
+             if file_path.is_file() and not _should_ignore_file(
+                 file_path, dir_path, all_ignore_patterns
+             ):
+                 arcname = file_path.relative_to(dir_path)
+                 zipf.write(file_path, arcname)
+                 logger.debug(f"Added to zip: {arcname}")
+
+     zip_size = zip_path.stat().st_size
+     logger.debug(f"Created zip file {zip_path} with size {zip_size:,} bytes")
+     return zip_size
+
+
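A minimal sketch of zipping a directory with these helpers (paths are illustrative):

    import tempfile
    from pathlib import Path

    src = Path("my_project")
    with tempfile.NamedTemporaryFile(suffix=".zip") as tmp:
        # any .gitignore in my_project plus the built-in defaults are honored
        size = _create_directory_zip(src, Path(tmp.name), ignore_patterns=["*.log"])
        print(f"zipped {src} -> {size:,} bytes")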
+ def _should_send_as_text_content(file_path: Path, file_size: int) -> bool:
+     """Check if a file should be sent as text content instead of file upload.
+
+     Args:
+         file_path: Path to the file
+         file_size: Size of file in bytes
+
+     Returns:
+         True if file should be sent as text content
+     """
+     # only small files can be treated as raw text
+     if file_size >= SMALL_FILE_THRESHOLD_BYTES:
+         return False
+
+     file_extension = file_path.suffix.lower().lstrip(".")
+     return file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT
+
+
+ def _extract_text_from_file(file_path: Path) -> str | None:
+     """Extract text content from a file.
+
+     Args:
+         file_path: Path to the file
+
+     Returns:
+         Extracted text content or None if extraction failed
+     """
+     file_extension = file_path.suffix.lower().lstrip(".")
+
+     if file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT:
+         try:
+             return file_path.read_text(encoding="utf-8")
+         except Exception as e:
+             logger.warning(f"Failed to extract text from {file_path}: {e}")
+             return None
+     return None
+
+
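How the two helpers combine in practice: a small markdown file goes up as inline text, while a large CSV or any unsupported extension falls through to the signed-URL upload path (sizes illustrative):

    from pathlib import Path

    _should_send_as_text_content(Path("notes.md"), 2 * 1024)                  # True: small, supported type
    _should_send_as_text_content(Path("data.csv"), 50 * 1024 * 1024)         # False: >= 10MB threshold
    _should_send_as_text_content(Path("data.parquet"), 1024)                 # False: unsupported type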
+ def _setup_upload_progress(file_path: Path, file_size: int, progress_bar: tqdm) -> None:
+     """Common setup for upload progress tracking."""
+     logger.debug(
+         f"Starting resumable upload for file: {file_path} (size: {file_size:,} bytes)"
+     )
+     progress_bar.set_description(f"Uploading {file_path.name}")
+     progress_bar.refresh()
+
+
+ async def _initiate_resumable_session(
+     session: aiohttp.ClientSession, signed_url: str
+ ) -> str:
+     """Initiate resumable upload session and return session URI."""
+     logger.debug("Initiating resumable upload session")
+     async with session.post(signed_url, headers=INITIATE_HEADERS) as initiate_response:
+         if initiate_response.status not in {200, 201}:
+             error_text = await initiate_response.text()
+             logger.error(
+                 f"Failed to initiate resumable session: {initiate_response.status}"
+             )
+             logger.error(f"Response: {error_text}")
+             initiate_response.raise_for_status()
+
+         return _validate_session_uri(initiate_response.headers.get("location"))
+
+
+ # TODO: temp
+ def _log_upload_debug(signed_url: str) -> None:
+     """Common debug logging for uploads."""
+     logger.debug(f"Signed URL: {signed_url[:100]}...")
+
+
+ # TODO: temp
+ def _validate_session_uri(session_uri: str | None) -> str:
+     """Validate and return session URI or raise exception."""
+     if not session_uri:
+         raise DataStorageError(
+             "No session URI returned from resumable upload initiation"
+         )
+     logger.debug(f"Resumable session initiated. Session URI: {session_uri[:100]}...")
+     return session_uri
+
+
+ async def _upload_chunk_with_retry(
+     session: aiohttp.ClientSession,
+     session_uri: str,
+     chunk_data: bytes,
+     range_start: int,
+     file_size: int,
+     progress_bar: tqdm,
+ ) -> int:
+     """Upload a single chunk with retry logic."""
+     range_end = range_start + len(chunk_data) - 1
+     chunk_headers = {
+         "Content-Type": "application/octet-stream",
+         "Content-Length": str(len(chunk_data)),
+         "Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
+     }
+
+     for attempt in range(MAX_RETRIES):
+         try:
+             async with session.put(
+                 session_uri, data=chunk_data, headers=chunk_headers
+             ) as chunk_response:
+                 if chunk_response.status == HTTP_RESUME_INCOMPLETE:
+                     progress_bar.update(len(chunk_data))
+                     logger.debug(f"Uploaded chunk: {range_end + 1}/{file_size} bytes")
+                     return len(chunk_data)
+                 if chunk_response.status in {200, 201}:
+                     progress_bar.update(len(chunk_data))
+                     logger.debug(
+                         f"Upload completed successfully. Final response: {chunk_response.status}"
+                     )
+                     return len(chunk_data)
+
+                 error_text = await chunk_response.text()
+                 logger.warning(
+                     f"Chunk upload failed (attempt {attempt + 1}/{MAX_RETRIES}): {chunk_response.status}"
+                 )
+                 logger.warning(f"Response: {error_text}")
+                 if attempt == MAX_RETRIES - 1:
+                     chunk_response.raise_for_status()
+
+         except (TimeoutError, aiohttp.ClientError) as e:
+             logger.warning(
+                 f"Chunk upload error (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
+             )
+             if attempt == MAX_RETRIES - 1:
+                 raise
+             await asyncio.sleep(2**attempt)
+
+     return 0
+
+
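The Content-Range arithmetic, worked for a 20 MB file with the 8 MB CHUNK_SIZE used here (GCS answers 308 Resume Incomplete for every chunk except the last, which returns 200/201):

    file_size = 20 * 1024 * 1024  # 20,971,520 bytes
    # chunk 1: Content-Range: bytes 0-8388607/20971520          -> 308
    # chunk 2: Content-Range: bytes 8388608-16777215/20971520   -> 308
    # chunk 3: Content-Range: bytes 16777216-20971519/20971520  -> 200 (4 MB remainder)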
+ async def _aupload_file_with_progress(
+     signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
+ ) -> None:
+     """Upload a file asynchronously using aiohttp with signed URL initiation."""
+     _setup_upload_progress(file_path, file_size, progress_bar)
+     _log_upload_debug(signed_url)
+
+     try:
+         retry_config = aiohttp.ClientTimeout(
+             total=max(600.0, file_size / (512 * 1024)), connect=30, sock_read=30
+         )
+         connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
+
+         async with aiohttp.ClientSession(
+             connector=connector, timeout=retry_config
+         ) as session:
+             session_uri = await _initiate_resumable_session(session, signed_url)
+
+             async with aiofiles.open(file_path, "rb") as file_obj:
+                 bytes_uploaded = 0
+
+                 while bytes_uploaded < file_size:
+                     remaining = file_size - bytes_uploaded
+                     current_chunk_size = min(CHUNK_SIZE, remaining)
+                     chunk_data = await file_obj.read(current_chunk_size)
+
+                     if not chunk_data:
+                         break
+
+                     uploaded_bytes = await _upload_chunk_with_retry(
+                         session,
+                         session_uri,
+                         chunk_data,
+                         bytes_uploaded,
+                         file_size,
+                         progress_bar,
+                     )
+                     bytes_uploaded += uploaded_bytes
+
+                     if bytes_uploaded >= file_size:
+                         break
+
+         logger.debug("Upload completed successfully")
+
+     except Exception as e:
+         logger.error(f"Async resumable upload error: {type(e).__name__}: {e}")
+         raise
+
+
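A minimal driver for the async uploader, assuming you already hold a signed GCS URL (the URL and file name here are placeholders):

    import asyncio
    from pathlib import Path
    from tqdm import tqdm

    async def main() -> None:
        path = Path("large_model.bin")
        size = path.stat().st_size
        with tqdm(total=size, unit="B", unit_scale=True) as bar:
            await _aupload_file_with_progress(
                "https://storage.googleapis.com/...", path, bar, size
            )

    asyncio.run(main())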
+ def _upload_file_with_progress(
+     signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
+ ) -> None:
+     """Upload a file synchronously using google.resumable_media with signed URL initiation."""
+     _setup_upload_progress(file_path, file_size, progress_bar)
+     _log_upload_debug(signed_url)
+
+     try:
+         session = requests_lib.Session()
+         retry_strategy = Retry(
+             total=MAX_RETRIES,
+             backoff_factor=2,
+             status_forcelist=[429, 500, 502, 503, 504],
+             allowed_methods=["POST", "PUT", "PATCH"],
+         )
+         adapter = HTTPAdapter(max_retries=retry_strategy)
+         session.mount("http://", adapter)
+         session.mount("https://", adapter)
+
+         logger.debug("Initiating resumable upload session")
+         initiate_response = session.post(
+             signed_url, headers=INITIATE_HEADERS, timeout=30
+         )
+
+         if initiate_response.status_code not in {200, 201}:
+             logger.error(
+                 f"Failed to initiate resumable session: {initiate_response.status_code}"
+             )
+             logger.error(f"Response: {initiate_response.text}")
+             initiate_response.raise_for_status()
+
+         session_uri = _validate_session_uri(initiate_response.headers.get("location"))
+
+         with open(file_path, "rb") as file_obj:
+             upload = resumable_requests.ResumableUpload(
+                 upload_url=signed_url, chunk_size=CHUNK_SIZE
+             )
+
+             # The session was initiated manually above, so wire up the private
+             # attributes that transmit_next_chunk() expects.
+             upload._resumable_url = session_uri
+             upload._total_bytes = file_size
+             upload._stream = ProgressWrapper(file_obj, progress_bar)
+
+             while not upload.finished:
+                 try:
+                     upload.transmit_next_chunk(session)
+                 except Exception as e:
+                     logger.error(f"Chunk upload failed: {e}")
+                     raise
+
+         logger.debug("Upload completed successfully using resumable_media library")
+
+     except Exception as e:
+         logger.error(f"Sync resumable upload error: {type(e).__name__}: {e}")
+         raise
+
+
+ class RestClientError(Exception):
+     """Base exception for REST client errors."""
+
+
+ class DataStorageError(RestClientError):
+     """Base exception for data storage operations."""
+
+
+ class DataStorageCreationError(DataStorageError):
+     """Raised when there's an error creating a data storage entry."""
+
+
+ class ProgressWrapper:
+     """Common progress wrapper for file uploads."""
+
+     def __init__(self, file_obj, progress_bar: tqdm):
+         self.file_obj = file_obj
+         self.progress_bar = progress_bar
+         self.bytes_read = 0
+
+     def read(self, size=-1):
+         data = self.file_obj.read(size)
+         if data:
+             self.bytes_read += len(data)
+             # drive the bar from the stream position so rewinds/retries
+             # never double-count progress
+             current_pos = self.file_obj.tell()
+             if current_pos > self.progress_bar.n:
+                 self.progress_bar.update(current_pos - self.progress_bar.n)
+         return data
+
+     def seek(self, offset, whence=0):
+         return self.file_obj.seek(offset, whence)
+
+     def tell(self):
+         return self.file_obj.tell()
+
+
+ class DataStorageMethods:
+     """Data storage methods for RestClient.
+
+     This class contains methods for interacting with the data storage API endpoints.
+     """
+
+     # needed for mypy `NoReturn`
+     def _handle_http_errors(self, e: HTTPStatusError) -> NoReturn:
+         """Handle common HTTP errors for data storage operations."""
+         if e.response.status_code == codes.FORBIDDEN:
+             raise DataStorageCreationError(
+                 "Not authorized to create data storage entries"
+             ) from e
+         if e.response.status_code == codes.UNPROCESSABLE_ENTITY:
+             raise DataStorageCreationError(
+                 f"Invalid request payload: {e.response.text}"
+             ) from e
+         raise DataStorageCreationError(
+             f"Error creating data storage entry: {e.response.status_code} - {e.response.text}"
+         ) from e
+
+     def _validate_file_path(self, file_path: str | Path) -> Path:
+         """Validate file path exists and return Path object."""
+         file_path = Path(file_path)
+         if not file_path.exists():
+             raise DataStorageCreationError(f"File or directory not found: {file_path}")
+         return file_path
+
+     def _build_zip_path(self, name: str, path: str | None) -> str:
+         """Build GCS path for zip file."""
+         zip_filename = name if name.endswith(".zip") else f"{name}.zip"
+         if path:
+             return f"{path.rstrip('/')}/{zip_filename}"
+         return zip_filename
+
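Worked examples of the zip-path rule above (values illustrative):

    # name="experiments", path="datasets/2024/"  ->  "datasets/2024/experiments.zip"
    # name="archive.zip",  path=None             ->  "archive.zip"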
+     # TODO: methods in here need to be moved to fh tools
+     # =====================================
+     def _is_zip_file(self, file_path: Path) -> bool:
+         """Check if a file is a zip file by checking its magic bytes."""
+         try:
+             with open(file_path, "rb") as f:
+                 magic = f.read(2)
+                 return magic == b"PK"
+         except Exception:
+             return False
+
+     def _extract_zip_file(self, zip_path: Path, extract_to: Path) -> Path:
+         """Extract a zip file and return the path to the extracted content.
+
+         Args:
+             zip_path: Path to the zip file
+             extract_to: Directory to extract to
+
+         Returns:
+             Path to the extracted content (directory or single file)
+         """
+         extract_dir = extract_to / "extracted"
+         extract_dir.mkdir(exist_ok=True)
+
+         with zipfile.ZipFile(zip_path, "r") as zip_ref:
+             zip_ref.extractall(extract_dir)
+             extracted_items = list(extract_dir.iterdir())
+
+         if len(extracted_items) == 1:
+             return extracted_items[0]
+         return extract_dir
+
+     async def _adownload_from_gcs(self, signed_url: str) -> Path:
+         """Download file from GCS using signed URL and handle unzipping if needed.
+
+         Args:
+             signed_url: The signed URL to download from
+
+         Returns:
+             Path to the downloaded file (or unzipped directory if it was a zip)
+         """
+         try:
+             with tempfile.TemporaryDirectory() as temp_dir_str:
+                 temp_dir = Path(temp_dir_str)
+                 temp_file = temp_dir / "downloaded_file"
+
+                 async with self.async_client.stream("GET", signed_url) as response:
+                     response.raise_for_status()
+
+                     content_disposition = response.headers.get(
+                         "content-disposition", ""
+                     )
+                     filename = "downloaded_file"
+                     if "filename=" in content_disposition:
+                         filename = content_disposition.split("filename=")[-1].strip('"')
+
+                     if filename != "downloaded_file":
+                         temp_file = temp_dir / filename
+
+                     async with aiofiles.open(temp_file, "wb") as f:
+                         async for chunk in response.aiter_bytes(chunk_size=8192):
+                             await f.write(chunk)
+
+                 logger.debug(
+                     f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
+                 )
+
+                 if self._is_zip_file(temp_file):
+                     logger.debug(f"File {temp_file} is a zip file, extracting...")
+                     extracted_path = self._extract_zip_file(temp_file, temp_dir)
+
+                     final_temp_dir = Path(tempfile.mkdtemp())
+                     final_path = final_temp_dir / extracted_path.name
+
+                     if extracted_path.is_dir():
+                         shutil.copytree(extracted_path, final_path)
+                     else:
+                         shutil.copy2(extracted_path, final_path)
+
+                     return final_path
+                 final_temp_dir = Path(tempfile.mkdtemp())
+                 final_file = final_temp_dir / temp_file.name
+                 shutil.copy2(temp_file, final_file)
+                 return final_file
+
+         except Exception as e:
+             raise DataStorageCreationError(f"Failed to download from GCS: {e}") from e
+
+     def _download_from_gcs(self, signed_url: str) -> Path:
+         """Download file from GCS using signed URL and handle unzipping if needed (sync version).
+
+         Args:
+             signed_url: The signed URL to download from
+
+         Returns:
+             Path to the downloaded file (or unzipped directory if it was a zip)
+         """
+         try:
+             with tempfile.TemporaryDirectory() as temp_dir_str:
+                 temp_dir = Path(temp_dir_str)
+                 temp_file = temp_dir / "downloaded_file"
+
+                 with requests_lib.get(signed_url, stream=True, timeout=30) as response:
+                     response.raise_for_status()
+
+                     content_disposition = response.headers.get(
+                         "content-disposition", ""
+                     )
+                     filename = "downloaded_file"
+                     if "filename=" in content_disposition:
+                         filename = content_disposition.split("filename=")[-1].strip('"')
+
+                     if filename != "downloaded_file":
+                         temp_file = temp_dir / filename
+
+                     with open(temp_file, "wb") as f:
+                         for chunk in response.iter_content(chunk_size=8192):
+                             f.write(chunk)
+
+                 logger.debug(
+                     f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
+                 )
+
+                 if self._is_zip_file(temp_file):
+                     logger.debug(f"File {temp_file} is a zip file, extracting...")
+                     extracted_path = self._extract_zip_file(temp_file, temp_dir)
+
+                     final_temp_dir = Path(tempfile.mkdtemp())
+                     final_path = final_temp_dir / extracted_path.name
+
+                     if extracted_path.is_dir():
+                         shutil.copytree(extracted_path, final_path)
+                     else:
+                         shutil.copy2(extracted_path, final_path)
+
+                     return final_path
+                 final_temp_dir = Path(tempfile.mkdtemp())
+                 final_file = final_temp_dir / temp_file.name
+                 shutil.copy2(temp_file, final_file)
+                 return final_file
+
+         except Exception as e:
+             raise DataStorageCreationError(f"Failed to download from GCS: {e}") from e
+
+     # =====================================
+
+     def _prepare_single_file_upload(
+         self, name: str, file_path: Path, description: str | None, path: str | None
+     ) -> tuple[int, DataStorageRequestPayload | None]:
+         """Prepare a single file for upload, returning the file size and, for small text files, a ready payload."""
+         file_size = file_path.stat().st_size
+         logger.debug(
+             f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
+         )
+
+         if _should_send_as_text_content(file_path, file_size):
+             logger.debug(
+                 f"Small text file ({file_size:,} bytes) - sending as text content"
+             )
+             text_content = _extract_text_from_file(file_path)
+             if text_content is not None:
+                 return file_size, DataStorageRequestPayload(
+                     name=name,
+                     description=description,
+                     content=text_content,
+                     path=path,
+                     is_collection=False,
+                 )
+             logger.warning(
+                 "Could not extract text content, falling back to file upload"
+             )
+
+         return file_size, None
+
+
+     def _create_data_storage_entry(
+         self, payload: DataStorageRequestPayload
+     ) -> DataStorageResponse:
+         """Create data storage entry via API (sync version)."""
+         response = self.client.post(
+             "/v0.1/data-storage",
+             json=payload.model_dump(mode="json", exclude_none=True),
+         )
+         response.raise_for_status()
+         return DataStorageResponse.model_validate(response.json())
+
+     async def _acreate_data_storage_entry(
+         self, payload: DataStorageRequestPayload
+     ) -> DataStorageResponse:
+         """Create data storage entry via API (async version)."""
+         response = await self.async_client.post(
+             "/v0.1/data-storage",
+             json=payload.model_dump(mode="json", exclude_none=True),
+         )
+         response.raise_for_status()
+         return DataStorageResponse.model_validate(response.json())
+
+     def _generate_folder_description_from_files(
+         self, dir_path: Path, manifest: DirectoryManifest
+     ) -> str:
+         """Generate folder description by concatenating descriptions of top-level files."""
+         descriptions = []
+
+         # Get top-level files only (not recursive)
+         for item in dir_path.iterdir():
+             if item.is_file():
+                 # Try to get description from manifest first
+                 file_desc = manifest.get_entry_description(item.name)
+
+                 if file_desc:
+                     descriptions.append(f"{item.name}: {file_desc}")
+                 else:
+                     descriptions.append(item.name)
+
+         if descriptions:
+             return f"Directory containing: {', '.join(descriptions)}"
+         return f"Directory: {dir_path.name}"
+
+     def _load_manifest(
+         self, dir_path: Path, manifest_filename: str | None
+     ) -> DirectoryManifest:
+         """Load and parse a manifest file (JSON or YAML) into a structured model."""
+         if not manifest_filename:
+             return DirectoryManifest()
+
+         manifest_path = dir_path / manifest_filename
+         if not manifest_path.exists():
+             logger.error(f"Manifest file not found at {manifest_path}")
+             raise DataStorageCreationError(
+                 f"Manifest file {manifest_filename} not found in directory {dir_path}. Ensure the manifest exists and is correctly named, or do not pass it as an argument."
+             )
+
+         try:
+             with open(manifest_path, encoding="utf-8") as f:
+                 data = {}
+                 if manifest_filename.lower().endswith(".json"):
+                     data = json.load(f)
+                 elif manifest_filename.lower().endswith((".yaml", ".yml")):
+                     if yaml is None:
+                         raise ImportError(
+                             "pyyaml is required to parse .yaml manifest files. "
+                             "Please install it with `pip install pyyaml`."
+                         )
+                     data = yaml.safe_load(f)
+                 else:
+                     logger.warning(
+                         f"Unsupported manifest file extension: {manifest_filename}"
+                     )
+                     return DirectoryManifest()
+
+             return DirectoryManifest.from_dict(data or {})
+
+         except Exception as e:
+             logger.warning(f"Failed to load manifest {manifest_filename}: {e}")
+
+         return DirectoryManifest()
+
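A plausible manifest shape, assuming the name-to-description mapping implied by `get_entry_description` (the authoritative schema is `DirectoryManifest.from_dict` in `futurehouse_client.models.data_storage_methods`; this dict is illustrative only):

    manifest = {
        "my_dataset": {
            "raw.csv": "Raw measurements, one row per sample",
            "processed": {"clean.csv": "Deduplicated and normalized measurements"},
        }
    }
    dm = DirectoryManifest.from_dict(manifest)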
+     def _upload_data_directory(
+         self,
+         name: str,
+         dir_path: Path,
+         description: str | None,
+         path: str | None = None,
+         ignore_patterns: list[str] | None = None,
+         ignore_filename: str = ".gitignore",
+     ) -> DataStorageResponse:
+         """Upload a directory as a single zip file collection.
+
+         Args:
+             name: Name for the directory collection
+             dir_path: Path to directory to zip and upload
+             description: Description for the collection
+             path: Optional GCS path for the zip file
+             ignore_patterns: List of patterns to ignore when zipping
+             ignore_filename: Name of ignore file to read from directory
+
+         Returns:
+             DataStorageResponse for the uploaded zip file
+         """
+         logger.debug(f"Uploading directory as zip: {dir_path}")
+
+         with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
+             temp_zip_path = Path(temp_file.name)
+
+             zip_size = _create_directory_zip(
+                 dir_path, temp_zip_path, ignore_patterns, ignore_filename
+             )
+
+             zip_gcs_path = self._build_zip_path(name, path)
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 description=description,
+                 path=zip_gcs_path,
+                 is_collection=True,
+             )
+
+             logger.debug(
+                 f"Creating data storage entry for zip: {payload.model_dump(exclude_none=True)}"
+             )
+             data_storage_response = self._create_data_storage_entry(payload)
+
+             if not data_storage_response.signed_url:
+                 raise DataStorageCreationError("No signed URL returned for zip upload")
+
+             with tqdm(
+                 total=zip_size,
+                 unit="B",
+                 unit_scale=True,
+                 unit_divisor=1024,
+                 desc=f"Uploading {dir_path.name} (zipped)",
+                 miniters=1,
+                 mininterval=0.1,
+             ) as pbar:
+                 _upload_file_with_progress(
+                     data_storage_response.signed_url, temp_zip_path, pbar, zip_size
+                 )
+
+             status_response = self.client.patch(
+                 f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+                 json={"status": "active"},
+             )
+             status_response.raise_for_status()
+
+             logger.debug(
+                 f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
+             )
+             return DataStorageResponse.model_validate(status_response.json())
+
+     async def _aupload_data_directory(
+         self,
+         name: str,
+         dir_path: Path,
+         description: str | None,
+         path: str | None = None,
+         ignore_patterns: list[str] | None = None,
+         ignore_filename: str = ".gitignore",
+     ) -> DataStorageResponse:
+         """Asynchronously upload a directory as a single zip file.
+
+         Args:
+             name: Name for the directory collection
+             dir_path: Path to directory to zip and upload
+             description: Description for the collection
+             path: Optional GCS path for the zip file
+             ignore_patterns: List of patterns to ignore when zipping
+             ignore_filename: Name of ignore file to read from directory
+
+         Returns:
+             DataStorageResponse for the uploaded zip file
+         """
+         logger.debug(f"Async uploading directory as zip: {dir_path}")
+
+         with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
+             temp_zip_path = Path(temp_file.name)
+
+             zip_size = _create_directory_zip(
+                 dir_path, temp_zip_path, ignore_patterns, ignore_filename
+             )
+
+             zip_gcs_path = self._build_zip_path(name, path)
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 description=description,
+                 path=zip_gcs_path,
+                 is_collection=True,
+             )
+
+             data_storage_response = await self._acreate_data_storage_entry(payload)
+
+             if not data_storage_response.signed_url:
+                 raise DataStorageCreationError("No signed URL returned for zip upload")
+
+             with tqdm(
+                 total=zip_size,
+                 unit="B",
+                 unit_scale=True,
+                 unit_divisor=1024,
+                 desc=f"Uploading {dir_path.name} (zipped)",
+                 miniters=1,
+                 mininterval=0.1,
+             ) as pbar:
+                 await _aupload_file_with_progress(
+                     data_storage_response.signed_url, temp_zip_path, pbar, zip_size
+                 )
+
+             status_response = await self.async_client.patch(
+                 f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+                 json={"status": "active"},
+             )
+             status_response.raise_for_status()
+
+             logger.debug(
+                 f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
+             )
+             return DataStorageResponse.model_validate(status_response.json())
+
+     def _upload_data_single_file(
+         self,
+         name: str,
+         file_path: Path,
+         description: str | None,
+         path: str | None = None,
+     ) -> DataStorageResponse:
+         """Upload a single file."""
+         file_size = file_path.stat().st_size
+         logger.debug(
+             f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
+         )
+
+         if _should_send_as_text_content(file_path, file_size):
+             logger.debug(
+                 f"Small text file ({file_size:,} bytes) - sending as text content"
+             )
+
+             text_content = _extract_text_from_file(file_path)
+             if text_content is not None:
+                 payload = DataStorageRequestPayload(
+                     name=name,
+                     description=description,
+                     content=text_content,
+                     path=path,
+                     is_collection=False,
+                 )
+
+                 logger.debug("Sending file as text content")
+                 return self._create_data_storage_entry(payload)
+             logger.warning(
+                 "Could not extract text content, falling back to file upload"
+             )
+
+         logger.debug(
+             f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
+         )
+         payload = DataStorageRequestPayload(
+             name=name,
+             description=description,
+             path=path,
+             is_collection=False,
+         )
+
+         logger.debug(
+             f"Requesting signed URL with payload: {payload.model_dump(exclude_none=True)}"
+         )
+
+         data_storage_response = self._create_data_storage_entry(payload)
+
+         if not data_storage_response.signed_url:
+             raise DataStorageCreationError("No signed URL returned from server")
+
+         with tqdm(
+             total=file_size,
+             unit="B",
+             unit_scale=True,
+             unit_divisor=1024,
+             desc=f"Uploading {file_path.name}",
+             miniters=1,
+             mininterval=0.1,
+         ) as pbar:
+             try:
+                 _upload_file_with_progress(
+                     data_storage_response.signed_url, file_path, pbar, file_size
+                 )
+                 logger.debug("File upload to signed URL completed successfully")
+             except Exception as e:
+                 logger.error(f"Failed to upload file to signed URL: {e}")
+                 raise
+
+         logger.debug("Updating data storage status to active")
+         status_response = self.client.patch(
+             f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+             json={"status": "active"},
+         )
+         status_response.raise_for_status()
+         logger.debug("Data storage status updated successfully")
+
+         return DataStorageResponse.model_validate(status_response.json())
+
+     async def _aupload_data_single_file(
+         self,
+         name: str,
+         file_path: Path,
+         description: str | None,
+         path: str | None = None,
+         dataset_id: UUID | None = None,
+     ) -> DataStorageResponse:
+         """Asynchronously upload a single file."""
+         file_size, text_payload = self._prepare_single_file_upload(
+             name, file_path, description, path
+         )
+
+         if text_payload:
+             logger.debug("Sending file as text content")
+             text_payload.dataset_id = dataset_id
+             return await self._acreate_data_storage_entry(text_payload)
+
+         logger.debug(
+             f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
+         )
+         payload = DataStorageRequestPayload(
+             name=name,
+             description=description,
+             path=path,
+             is_collection=False,
+             dataset_id=dataset_id,
+         )
+
+         data_storage_response = await self._acreate_data_storage_entry(payload)
+
+         if not data_storage_response.signed_url:
+             raise DataStorageCreationError("No signed URL returned from server")
+
+         with tqdm(
+             total=file_size,
+             unit="B",
+             unit_scale=True,
+             unit_divisor=1024,
+             desc=f"Uploading {file_path.name}",
+             miniters=1,
+             mininterval=0.1,
+         ) as pbar:
+             await _aupload_file_with_progress(
+                 data_storage_response.signed_url, file_path, pbar, file_size
+             )
+
+         status_response = await self.async_client.patch(
+             f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+             json={"status": "active"},
+         )
+         status_response.raise_for_status()
+
+         return DataStorageResponse.model_validate(status_response.json())
+
+     def _upload_data_single_file_with_parent(
+         self,
+         name: str,
+         file_path: Path,
+         description: str | None,
+         path: str | None,
+         parent_id: UUID | None,
+         dataset_id: UUID | None = None,
+     ) -> DataStorageResponse:
+         """Upload a single file with a parent ID (sync version)."""
+         file_size, text_payload = self._prepare_single_file_upload(
+             name, file_path, description, path
+         )
+
+         if text_payload:
+             logger.debug("Sending file as text content with parent_id")
+             text_payload.parent_id = parent_id
+             text_payload.dataset_id = dataset_id
+             return self._create_data_storage_entry(text_payload)
+
+         logger.debug(
+             f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
+         )
+         payload = DataStorageRequestPayload(
+             name=name,
+             description=description,
+             path=path,
+             is_collection=False,
+             parent_id=parent_id,
+             dataset_id=dataset_id,
+         )
+         data_storage_response = self._create_data_storage_entry(payload)
+
+         if not data_storage_response.signed_url:
+             raise DataStorageCreationError("No signed URL returned from server")
+
+         with tqdm(
+             total=file_size,
+             unit="B",
+             unit_scale=True,
+             unit_divisor=1024,
+             desc=f"Uploading {file_path.name}",
+             miniters=1,
+             mininterval=0.1,
+             leave=False,
+         ) as pbar:
+             _upload_file_with_progress(
+                 data_storage_response.signed_url, file_path, pbar, file_size
+             )
+
+         status_response = self.client.patch(
+             f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+             json={"status": "active"},
+         )
+         status_response.raise_for_status()
+
+         return DataStorageResponse.model_validate(status_response.json())
+
+     def _process_file_item(
+         self,
+         item: Path,
+         dir_manifest: DirectoryManifest,
+         current_parent_id: UUID,
+         dataset_id: UUID | None = None,
+     ) -> DataStorageResponse | None:
+         """Process a single file item for upload."""
+         try:
+             manifest_desc = dir_manifest.get_entry_description(item.name)
+             file_description = manifest_desc or f"File: {item.name}"
+
+             logger.debug(
+                 f"Processing file {item.name} with description: '{file_description}'"
+             )
+
+             return self._upload_data_single_file_with_parent(
+                 name=item.name,
+                 file_path=item,
+                 description=file_description,
+                 path=None,
+                 parent_id=current_parent_id,
+                 dataset_id=dataset_id,
+             )
+         except Exception as e:
+             logger.error(f"Failed to upload file {item}: {e}")
+             return None
+
+     def _upload_directory_hierarchically(
+         self,
+         name: str,
+         dir_path: Path,
+         description: str | None = None,
+         manifest_filename: str | None = None,
+         parent_id: UUID | None = None,
+         ignore_patterns: list[str] | None = None,
+         ignore_filename: str = ".gitignore",
+         base_dir: Path | None = None,
+         dir_manifest: DirectoryManifest | None = None,
+         dataset_id: UUID | None = None,
+     ) -> list[DataStorageResponse]:
+         """Upload a directory as a hierarchy: one storage entry per file and subdirectory, all in a single dataset."""
+         responses = []
+         if parent_id is None:
+             base_dir = dir_path
+             all_ignore_patterns = _collect_ignore_patterns(
+                 base_dir, ignore_patterns, ignore_filename
+             )
+
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 description=description,
+                 parent_id=None,
+                 dataset_id=None,
+                 is_collection=False,
+             )
+
+             dir_response = self._create_data_storage_entry(payload)
+             responses.append(dir_response)
+             current_parent_id = dir_response.data_storage.id
+             current_dataset_id = dir_response.data_storage.dataset_id
+
+             dir_manifest = self._load_directory_manifest(
+                 manifest_filename, parent_id, dir_path
+             )
+         else:
+             all_ignore_patterns = ignore_patterns or []
+             current_parent_id = parent_id
+             current_dataset_id = dataset_id
+
+         for item in dir_path.iterdir():
+             if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
+                 continue
+
+             if item.is_dir():
+                 subdir_manifest = DirectoryManifest()
+                 if dir_manifest:
+                     entry = dir_manifest.entries.get(item.name)
+                     if isinstance(entry, DirectoryManifest):
+                         subdir_manifest = entry
+                     elif isinstance(entry, ManifestEntry):
+                         # Convert single entry to manifest
+                         subdir_manifest = DirectoryManifest(entries={item.name: entry})
+
+                 subdir_description = subdir_manifest.get_entry_description(item.name)
+                 if not subdir_description:
+                     subdir_description = self._generate_folder_description_from_files(
+                         item, subdir_manifest
+                     )
+
+                 subdir_payload = DataStorageRequestPayload(
+                     name=item.name,
+                     description=subdir_description,
+                     parent_id=current_parent_id,
+                     dataset_id=current_dataset_id,
+                     is_collection=False,
+                 )
+                 subdir_response = self._create_data_storage_entry(subdir_payload)
+                 responses.append(subdir_response)
+
+                 subdir_responses = self._upload_directory_hierarchically(
+                     name=item.name,
+                     dir_path=item,
+                     description=None,
+                     manifest_filename=None,
+                     parent_id=subdir_response.data_storage.id,
+                     ignore_patterns=all_ignore_patterns,
+                     ignore_filename=ignore_filename,
+                     base_dir=base_dir,
+                     dir_manifest=subdir_manifest,
+                     dataset_id=current_dataset_id,
+                 )
+                 responses.extend(subdir_responses)
+             elif item.is_file():
+                 file_response = self._process_file_item(
+                     item,
+                     dir_manifest or DirectoryManifest(),
+                     current_parent_id,
+                     current_dataset_id,
+                 )
+                 if file_response:
+                     responses.append(file_response)
+
+         return responses
+
+     def _load_directory_manifest(
+         self,
+         manifest_filename: str | None,
+         parent_id: UUID | None,
+         dir_path: Path,
+     ) -> DirectoryManifest:
+         """Load directory manifest if available."""
+         if manifest_filename and not parent_id:
+             # the manifest file is resolved relative to the current working directory
+             manifest_data = self._load_manifest(Path.cwd(), manifest_filename)
+             dir_name = dir_path.name
+             logger.debug(
+                 f"Loaded manifest entries: {list(manifest_data.entries.keys())}"
+             )
+             logger.debug(
+                 f"Looking for manifest entry with directory name: '{dir_name}'"
+             )
+
+             entry = manifest_data.entries.get(dir_name)
+             if isinstance(entry, DirectoryManifest):
+                 return entry
+             if isinstance(entry, ManifestEntry):
+                 return DirectoryManifest(entries={dir_name: entry})
+             logger.debug(
+                 f"No manifest entry found for '{dir_name}', available keys: {list(manifest_data.entries.keys())}"
+             )
+             return DirectoryManifest()
+         return DirectoryManifest()
+
+     async def _aupload_data_single_file_with_parent(
+         self,
+         name: str,
+         file_path: Path,
+         description: str | None,
+         path: str | None,
+         parent_id: UUID | None,
+         dataset_id: UUID | None = None,
+     ) -> DataStorageResponse:
+         """Asynchronously upload a single file with a parent ID."""
+         file_size, text_payload = self._prepare_single_file_upload(
+             name, file_path, description, path
+         )
+
+         if text_payload:
+             logger.debug("Sending file as text content with parent_id")
+             text_payload.parent_id = parent_id
+             text_payload.dataset_id = dataset_id
+             return await self._acreate_data_storage_entry(text_payload)
+
+         logger.debug(
+             f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
+         )
+         payload = DataStorageRequestPayload(
+             name=name,
+             description=description,
+             path=path,
+             is_collection=False,
+             parent_id=parent_id,
+             dataset_id=dataset_id,
+         )
+         data_storage_response = await self._acreate_data_storage_entry(payload)
+
+         if not data_storage_response.signed_url:
+             raise DataStorageCreationError("No signed URL returned from server")
+
+         with tqdm(
+             total=file_size,
+             unit="B",
+             unit_scale=True,
+             unit_divisor=1024,
+             desc=f"Uploading {file_path.name}",
+             miniters=1,
+             mininterval=0.1,
+         ) as pbar:
+             await _aupload_file_with_progress(
+                 data_storage_response.signed_url, file_path, pbar, file_size
+             )
+
+         status_response = await self.async_client.patch(
+             f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
+             json={"status": "active"},
+         )
+         status_response.raise_for_status()
+
+         return DataStorageResponse.model_validate(status_response.json())
+
+     async def _aprocess_file_item(
+         self,
+         item: Path,
+         dir_manifest: DirectoryManifest,
+         current_parent_id: UUID,
+         dataset_id: UUID | None = None,
+     ) -> DataStorageResponse | None:
+         """Asynchronously process a single file item for upload."""
+         try:
+             manifest_desc = dir_manifest.get_entry_description(item.name)
+             file_description = manifest_desc or f"File: {item.name}"
+
+             logger.debug(
+                 f"Processing file {item.name} with description: '{file_description}'"
+             )
+
+             return await self._aupload_data_single_file_with_parent(
+                 name=item.name,
+                 file_path=item,
+                 description=file_description,
+                 path=None,
+                 parent_id=current_parent_id,
+                 dataset_id=dataset_id,
+             )
+         except Exception as e:
+             logger.error(f"Failed to upload file {item}: {e}")
+             return None
+
+     async def _aupload_directory_hierarchically(
+         self,
+         name: str,
+         dir_path: Path,
+         description: str | None = None,
+         manifest_filename: str | None = None,
+         parent_id: UUID | None = None,
+         ignore_patterns: list[str] | None = None,
+         ignore_filename: str = ".gitignore",
+         base_dir: Path | None = None,
+         dir_manifest: DirectoryManifest | None = None,
+         dataset_id: UUID | None = None,
+     ) -> list[DataStorageResponse]:
+         """Upload a directory as a hierarchy of individual storage entries in a single dataset (async version)."""
+         responses = []
+
+         if parent_id is None:
+             base_dir = dir_path
+             all_ignore_patterns = _collect_ignore_patterns(
+                 base_dir, ignore_patterns, ignore_filename
+             )
+
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 description=description,
+                 parent_id=None,
+                 dataset_id=None,
+                 is_collection=False,
+             )
+
+             dir_response = await self._acreate_data_storage_entry(payload)
+             responses.append(dir_response)
+             current_parent_id = dir_response.data_storage.id
+             current_dataset_id = dir_response.data_storage.dataset_id
+
+             dir_manifest = self._load_directory_manifest(
+                 manifest_filename, parent_id, dir_path
+             )
+         else:
+             all_ignore_patterns = ignore_patterns or []
+             current_parent_id = parent_id
+             current_dataset_id = dataset_id
+
+         for item in dir_path.iterdir():
+             if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
+                 continue
+
+             if item.is_dir():
+                 subdir_manifest = DirectoryManifest()
+                 if dir_manifest:
+                     entry = dir_manifest.entries.get(item.name)
+                     if isinstance(entry, DirectoryManifest):
+                         subdir_manifest = entry
+                     elif isinstance(entry, ManifestEntry):
+                         subdir_manifest = DirectoryManifest(entries={item.name: entry})
+
+                 subdir_description = subdir_manifest.get_entry_description(item.name)
+                 if not subdir_description:
+                     subdir_description = self._generate_folder_description_from_files(
+                         item, subdir_manifest
+                     )
+
+                 subdir_payload = DataStorageRequestPayload(
+                     name=item.name,
+                     description=subdir_description,
+                     parent_id=current_parent_id,
+                     dataset_id=current_dataset_id,
+                     is_collection=False,
+                 )
+                 subdir_response = await self._acreate_data_storage_entry(subdir_payload)
+                 responses.append(subdir_response)
+
+                 subdir_responses = await self._aupload_directory_hierarchically(
+                     name=item.name,
+                     dir_path=item,
+                     description=None,
+                     manifest_filename=None,
+                     parent_id=subdir_response.data_storage.id,
+                     ignore_patterns=all_ignore_patterns,
+                     ignore_filename=ignore_filename,
+                     base_dir=base_dir,
+                     dir_manifest=subdir_manifest,
+                     dataset_id=current_dataset_id,
+                 )
+                 responses.extend(subdir_responses)
+             elif item.is_file():
+                 file_response = await self._aprocess_file_item(
+                     item,
+                     dir_manifest or DirectoryManifest(),
+                     current_parent_id,
+                     current_dataset_id,
+                 )
+                 if file_response:
+                     responses.append(file_response)
+
+         return responses
+
+
+     @property
+     def client(self) -> Client:
+         raise NotImplementedError("client property must be implemented by subclass")
+
+     @property
+     def async_client(self) -> AsyncClient:
+         raise NotImplementedError(
+             "async_client property must be implemented by subclass"
+         )
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     def store_text_content(
+         self,
+         name: str,
+         content: str,
+         description: str | None = None,
+         path: str | None = None,
+     ) -> DataStorageResponse:
+         """Store content as a string in the data storage system.
+
+         Args:
+             name: Name of the data storage entry
+             content: Content to store as a string
+             description: Optional description of the data storage entry
+             path: Optional path for the data storage entry
+
+         Returns:
+             DataStorageResponse containing the created data storage entry and storage locations
+
+         Raises:
+             DataStorageCreationError: If there's an error creating the data storage entry
+         """
+         try:
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 content=content,
+                 description=description,
+                 path=path,
+             )
+             return self._create_data_storage_entry(payload)
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred: {e!r}"
+             ) from e
+
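A minimal usage sketch for the text path, assuming `client` is an instance of a concrete `RestClient` subclass that mixes in `DataStorageMethods` (construction details are illustrative):

    resp = client.store_text_content(
        name="lab-notes",
        content="Observations from run 42",
        description="Free-form lab notes",
    )
    print(resp.data_storage.id)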
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     async def astore_text_content(
+         self,
+         name: str,
+         content: str,
+         description: str | None = None,
+         path: str | None = None,
+         dataset_id: UUID | None = None,
+     ) -> DataStorageResponse:
+         """Asynchronously store content as a string in the data storage system.
+
+         Args:
+             name: Name of the data storage entry
+             content: Content to store as a string
+             description: Optional description of the data storage entry
+             path: Optional path for the data storage entry
+             dataset_id: Optional dataset ID to add entry to, or None to create new dataset
+
+         Returns:
+             DataStorageResponse containing the created data storage entry and storage locations
+
+         Raises:
+             DataStorageCreationError: If there's an error creating the data storage entry
+         """
+         try:
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 content=content,
+                 description=description,
+                 path=path,
+                 dataset_id=dataset_id,
+             )
+             return await self._acreate_data_storage_entry(payload)
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred: {e!r}"
+             ) from e
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     def store_file_content(
+         self,
+         name: str,
+         file_path: str | Path,
+         description: str | None = None,
+         path: str | None = None,
+         as_collection: bool = False,
+         manifest_filename: str | None = None,
+         ignore_patterns: list[str] | None = None,
+         ignore_filename: str = ".gitignore",
+     ) -> DataStorageResponse:
+         """Store file or directory content in the data storage system.
+
+         For files: Small text files (< 10MB, supported formats) are sent as text content,
+         larger/binary files are uploaded via signed URL.
+
+         For directories: Zipped as a single file with ignore pattern support and uploaded
+         as a collection.
+
+         Args:
+             name: Name of the data storage entry
+             file_path: Path to file or directory to upload
+             description: Optional description of the data storage entry
+             path: Optional path for the data storage entry
+             as_collection: If true, upload directories as a single zip file collection.
+             manifest_filename: Name of manifest file
+             ignore_patterns: List of patterns to ignore when zipping directories
+             ignore_filename: Name of ignore file to read from directory (default: .gitignore)
+
+         Returns:
+             DataStorageResponse containing the final data storage entry
+
+         Raises:
+             DataStorageCreationError: If there's an error in the process
+         """
+         file_path = self._validate_file_path(file_path)
+
+         try:
+             if file_path.is_dir() and as_collection:
+                 return self._upload_data_directory(
+                     name, file_path, description, path, ignore_patterns, ignore_filename
+                 )
+             if file_path.is_dir() and not as_collection:
+                 responses = self._upload_directory_hierarchically(
+                     name=name,
+                     dir_path=file_path,
+                     description=description,
+                     manifest_filename=manifest_filename,
+                     ignore_patterns=ignore_patterns,
+                     ignore_filename=ignore_filename,
+                 )
+                 if not responses:
+                     raise DataStorageCreationError(
+                         "No data storage entries were created"
+                     )
+                 return responses[0]
+             return self._upload_data_single_file(name, file_path, description, path)
+
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred during file upload: {e!r}"
+             ) from e
+
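How the two directory modes differ in practice (client construction and paths are illustrative):

    # One zipped collection: a single storage entry backed by "my-dataset.zip" in GCS
    client.store_file_content("my-dataset", "data/my_dataset", as_collection=True)

    # Hierarchical: one entry per subdirectory and file, linked by parent_id
    client.store_file_content(
        "my-dataset",
        "data/my_dataset",
        manifest_filename="manifest.json",  # optional per-file descriptions
    )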
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     async def astore_file_content(
+         self,
+         name: str,
+         file_path: str | Path,
+         description: str | None = None,
+         path: str | None = None,
+         as_collection: bool = False,
+         manifest_filename: str | None = None,
+         ignore_patterns: list[str] | None = None,
+         ignore_filename: str = ".gitignore",
+         dataset_id: UUID | None = None,
+     ) -> DataStorageResponse:
+         """Asynchronously store file or directory content in the data storage system.
+
+         Args:
+             name: Name of the data storage entry.
+             file_path: Path to the file or directory to upload.
+             description: Optional description for the entry.
+             path: Optional GCS path for the entry.
+             as_collection: If uploading a directory, `True` zips it into a single collection,
+                 `False` uploads it as a hierarchical structure of individual objects.
+             manifest_filename: Optional manifest file for hierarchical uploads.
+             ignore_patterns: List of patterns to ignore when zipping.
+             ignore_filename: Name of ignore file to read (default: .gitignore).
+             dataset_id: Optional dataset ID to add entry to, or None to create new dataset.
+
+         Returns:
+             The `DataStorageResponse` for the created entry. For hierarchical uploads,
+             this is the response for the root directory entry.
+         """
+         file_path = self._validate_file_path(file_path)
+
+         try:
+             if file_path.is_dir():
+                 if as_collection:
+                     return await self._aupload_data_directory(
+                         name,
+                         file_path,
+                         description,
+                         path,
+                         ignore_patterns,
+                         ignore_filename,
+                     )
+                 responses = await self._aupload_directory_hierarchically(
+                     name=name,
+                     dir_path=file_path,
+                     description=description,
+                     manifest_filename=manifest_filename,
+                     ignore_patterns=ignore_patterns,
+                     ignore_filename=ignore_filename,
+                     dataset_id=dataset_id,
+                 )
+                 if not responses:
+                     raise DataStorageCreationError(
+                         "No data storage entries were created"
+                     )
+                 return responses[0]
+             return await self._aupload_data_single_file(
+                 name, file_path, description, path, dataset_id
+             )
+
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred during async file upload: {e!r}"
+             ) from e
+
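The async variant mirrors the sync call; a sketch with asyncio (client construction and file names are illustrative):

    import asyncio

    async def upload() -> None:
        resp = await client.astore_file_content(
            name="trajectories",
            file_path="results/trajectories.parquet",
            description="Rollout trajectories from the June sweep",
        )
        print(resp.data_storage.id)

    asyncio.run(upload())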
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     def register_existing_data_source(
+         self,
+         name: str,
+         existing_location: DataStorageLocationPayload,
+         description: str | None = None,
+         path: str | None = None,
+     ) -> DataStorageResponse:
+         """Register an existing data source location in the data storage system.
+
+         Args:
+             name: Name of the data storage entry
+             existing_location: Describes the existing data source location to register
+             description: Optional description of the data storage entry
+             path: Optional path for the data storage entry
+
+         Returns:
+             DataStorageResponse containing the created data storage entry and storage locations
+
+         Raises:
+             DataStorageCreationError: If there's an error creating the data storage entry
+         """
+         try:
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 description=description,
+                 path=path,
+                 existing_location=existing_location,
+             )
+             response = self.client.post(
+                 "/v0.1/data-storage", json=payload.model_dump(exclude_none=True)
+             )
+             response.raise_for_status()
+             return DataStorageResponse.model_validate(response.json())
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred: {e!r}"
+             ) from e
+
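This registers data that already lives in a bucket rather than uploading it; the `DataStorageLocationPayload` construction is elided because its fields are defined in `futurehouse_client.models.data_storage_methods` (the call shape below is the only part this sketch asserts):

    location = DataStorageLocationPayload(...)  # e.g. the bucket/object coordinates
    resp = client.register_existing_data_source(
        name="public-genomes",
        existing_location=location,
        description="Pre-existing GCS dataset registered in place",
    )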
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     async def aregister_existing_data_source(
+         self,
+         name: str,
+         existing_location: DataStorageLocationPayload,
+         description: str | None = None,
+         path: str | None = None,
+     ) -> DataStorageResponse:
+         """Asynchronously register an existing data source location in the data storage system.
+
+         Args:
+             name: Name of the data storage entry
+             existing_location: Describes the existing data source location to register
+             description: Optional description of the data storage entry
+             path: Optional path for the data storage entry
+
+         Returns:
+             DataStorageResponse containing the created data storage entry and storage locations
+
+         Raises:
+             DataStorageCreationError: If there's an error creating the data storage entry
+         """
+         try:
+             payload = DataStorageRequestPayload(
+                 name=name,
+                 description=description,
+                 path=path,
+                 existing_location=existing_location,
+             )
+             response = await self.async_client.post(
+                 "/v0.1/data-storage", json=payload.model_dump(exclude_none=True)
+             )
+             response.raise_for_status()
+             return DataStorageResponse.model_validate(response.json())
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred: {e!r}"
+             ) from e
+
+     # TODO: EVERYTHING BELOW THIS LINE SHOULD BE MOVED TO FH_TOOLS REPO
+     # =================================================
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     def fetch_data_from_storage(
+         self,
+         data_storage_id: UUID | None = None,
+     ) -> str | Path | None:
+         """Fetch data from the storage system (sync version).
+
+         Args:
+             data_storage_id: ID of the data storage entry to fetch
+
+         Returns:
+             For raw content storage: the stored string content.
+             For GCS storage: Path to the downloaded file (unzipped if it was a zip).
+             None if no content was found.
+         """
+         if not data_storage_id:
+             raise DataStorageCreationError(
+                 "data_storage_id must be provided at this time"
+             )
+
+         try:
+             response = self.client.get(f"/v0.1/data-storage/{data_storage_id}")
+             response.raise_for_status()
+             result = DataStorageResponse.model_validate(response.json())
+
+             storage_type = result.storage_location.storage_config.storage_type
+
+             if storage_type == "gcs":
+                 if not result.signed_url:
+                     raise DataStorageCreationError(
+                         "No signed URL available for GCS download"
+                     )
+
+                 return self._download_from_gcs(result.signed_url)
+
+             if storage_type == "raw_content":
+                 content = result.data_storage.content
+                 if content is None:
+                     logger.warning(
+                         f"No content found for data storage entry {data_storage_id}"
+                     )
+                     return None
+                 return content
+
+             raise DataStorageCreationError(f"Unsupported storage type: {storage_type}")
+
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred: {e!r}"
+             ) from e
+
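Fetching returns a str for inline content and a Path for GCS-backed entries (zips come back extracted), so callers should branch on the type (entry_id below stands in for an existing entry's UUID):

    from pathlib import Path

    result = client.fetch_data_from_storage(data_storage_id=entry_id)
    if isinstance(result, Path):
        print(f"downloaded to {result}")  # a file, or a directory if it was a zip
    elif isinstance(result, str):
        print(result[:200])               # inline text content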
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, max=10),
+         retry=retry_if_connection_error,
+         before_sleep=before_sleep_log(logger, logging.WARNING),
+     )
+     async def afetch_data_from_storage(
+         self,
+         data_storage_id: UUID | None = None,
+     ) -> str | Path | None:
+         """Asynchronously fetch data from the storage system.
+
+         Args:
+             data_storage_id: ID of the data storage entry to fetch
+
+         Returns:
+             For raw content storage: the stored string content.
+             For GCS storage: Path to the downloaded file (unzipped if it was a zip).
+             None if no content was found.
+         """
+         if not data_storage_id:
+             raise DataStorageCreationError(
+                 "data_storage_id must be provided at this time"
+             )
+
+         try:
+             response = await self.async_client.get(
+                 f"/v0.1/data-storage/{data_storage_id}"
+             )
+             response.raise_for_status()
+             result = DataStorageResponse.model_validate(response.json())
+
+             storage_type = result.storage_location.storage_config.storage_type
+
+             if storage_type == "gcs":
+                 if not result.signed_url:
+                     raise DataStorageCreationError(
+                         "No signed URL available for GCS download"
+                     )
+
+                 return await self._adownload_from_gcs(result.signed_url)
+
+             if storage_type == "raw_content":
+                 content = result.data_storage.content
+                 if content is None:
+                     logger.warning(
+                         f"No content found for data storage entry {data_storage_id}"
+                     )
+                     return None
+                 return content
+
+             raise DataStorageCreationError(f"Unsupported storage type: {storage_type}")
+
+         except HTTPStatusError as e:
+             self._handle_http_errors(e)
+         except Exception as e:
+             raise DataStorageCreationError(
+                 f"An unexpected error occurred: {e!r}"
+             ) from e