futurehouse-client 0.3.20.dev266__py3-none-any.whl → 0.3.20.dev411__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- futurehouse_client/__init__.py +8 -0
- futurehouse_client/clients/data_storage_methods.py +1867 -0
- futurehouse_client/clients/rest_client.py +247 -33
- futurehouse_client/models/data_storage_methods.py +333 -0
- futurehouse_client/models/rest.py +15 -0
- futurehouse_client/utils/general.py +34 -0
- futurehouse_client/utils/world_model_tools.py +69 -0
- futurehouse_client/version.py +16 -3
- {futurehouse_client-0.3.20.dev266.dist-info → futurehouse_client-0.3.20.dev411.dist-info}/METADATA +6 -1
- futurehouse_client-0.3.20.dev411.dist-info/RECORD +23 -0
- futurehouse_client-0.3.20.dev266.dist-info/RECORD +0 -20
- {futurehouse_client-0.3.20.dev266.dist-info → futurehouse_client-0.3.20.dev411.dist-info}/WHEEL +0 -0
- {futurehouse_client-0.3.20.dev266.dist-info → futurehouse_client-0.3.20.dev411.dist-info}/licenses/LICENSE +0 -0
- {futurehouse_client-0.3.20.dev266.dist-info → futurehouse_client-0.3.20.dev411.dist-info}/top_level.txt +0 -0
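
The bulk of the change is the new `futurehouse_client/clients/data_storage_methods.py`, which adds a `DataStorageMethods` mixin exposing `store_text_content`, `store_file_content`, and their async variants for pushing text, files, and whole directories to the data-storage API. A minimal usage sketch is shown below; it assumes the package's top-level `FutureHouseClient` mixes in these methods and accepts an `api_key` argument (neither is confirmed by this diff), with method signatures taken from the code that follows.

```python
# Hypothetical usage sketch of the new data-storage methods in 0.3.20.dev411.
# Assumptions: FutureHouseClient is importable from the package root, mixes in
# DataStorageMethods, and takes an api_key argument; signatures come from this diff.
from futurehouse_client import FutureHouseClient

client = FutureHouseClient(api_key="...")  # auth details are an assumption

# Small text payloads are stored directly as content.
note = client.store_text_content(
    name="experiment-notes.md",
    content="# Results\nRun 42 converged.",
    description="Notes from run 42",
)

# Files and directories go through store_file_content; directories are either
# uploaded hierarchically (default) or zipped into one collection.
dataset = client.store_file_content(
    name="run-42-data",
    file_path="results/run42/",
    as_collection=False,
    ignore_patterns=["*.tmp"],
)
print(note.data_storage.id, dataset.data_storage.id)
```

Per the module below, directories default to hierarchical upload (one storage entry per file and folder), while `as_collection=True` zips the directory and uploads it as a single collection, honoring `.gitignore`-style ignore patterns in both modes.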
futurehouse_client/clients/data_storage_methods.py
@@ -0,0 +1,1867 @@
|
|
1
|
+
import asyncio
|
2
|
+
import fnmatch
|
3
|
+
import json
|
4
|
+
import logging
|
5
|
+
import shutil
|
6
|
+
import tempfile
|
7
|
+
import zipfile
|
8
|
+
from os import PathLike
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import NoReturn
|
11
|
+
from uuid import UUID
|
12
|
+
|
13
|
+
import aiofiles
|
14
|
+
import aiohttp
|
15
|
+
import requests as requests_lib
|
16
|
+
from google.resumable_media import requests as resumable_requests
|
17
|
+
from httpx import AsyncClient, Client, HTTPStatusError, codes
|
18
|
+
from requests.adapters import HTTPAdapter
|
19
|
+
from tenacity import (
|
20
|
+
before_sleep_log,
|
21
|
+
retry,
|
22
|
+
stop_after_attempt,
|
23
|
+
wait_exponential,
|
24
|
+
)
|
25
|
+
from tqdm import tqdm
|
26
|
+
from urllib3.util.retry import Retry
|
27
|
+
|
28
|
+
from futurehouse_client.models.data_storage_methods import (
|
29
|
+
DataStorageLocationPayload,
|
30
|
+
DataStorageRequestPayload,
|
31
|
+
DataStorageResponse,
|
32
|
+
DirectoryManifest,
|
33
|
+
ManifestEntry,
|
34
|
+
)
|
35
|
+
from futurehouse_client.utils.general import retry_if_connection_error
|
36
|
+
|
37
|
+
# this is only required if they're using a yaml manifest
|
38
|
+
try:
|
39
|
+
import yaml
|
40
|
+
except ImportError:
|
41
|
+
yaml = None # type: ignore[assignment]
|
42
|
+
|
43
|
+
|
44
|
+
logger = logging.getLogger(__name__)
|
45
|
+
|
46
|
+
# TODO: pdf support, unsure what package we want to use
|
47
|
+
SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT = ["txt", "md", "csv", "json", "yaml", "yml"]
|
48
|
+
CHUNK_SIZE = 8 * 1024 * 1024 # 8MB
|
49
|
+
MAX_RETRIES = 3
|
50
|
+
SMALL_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024 # 10MB
|
51
|
+
HTTP_RESUME_INCOMPLETE = 308
|
52
|
+
INITIATE_HEADERS = {
|
53
|
+
"Content-Type": "application/octet-stream",
|
54
|
+
"x-goog-resumable": "start",
|
55
|
+
"Content-Length": "0",
|
56
|
+
}
|
57
|
+
|
58
|
+
|
59
|
+
def _should_ignore_file(
|
60
|
+
file_path: Path | PathLike,
|
61
|
+
base_path: Path | PathLike,
|
62
|
+
ignore_patterns: list[str] | None = None,
|
63
|
+
) -> bool:
|
64
|
+
"""Check if a file should be ignored based on ignore patterns.
|
65
|
+
|
66
|
+
Args:
|
67
|
+
file_path: Path to the file to check
|
68
|
+
base_path: Base directory path
|
69
|
+
ignore_patterns: List of ignore patterns (supports gitignore-style patterns)
|
70
|
+
|
71
|
+
Returns:
|
72
|
+
True if file should be ignored
|
73
|
+
"""
|
74
|
+
if not ignore_patterns:
|
75
|
+
return False
|
76
|
+
|
77
|
+
try:
|
78
|
+
file_path = Path(file_path)
|
79
|
+
base_path = Path(base_path)
|
80
|
+
rel_path = file_path.relative_to(base_path)
|
81
|
+
rel_path_str = str(rel_path)
|
82
|
+
|
83
|
+
for pattern in ignore_patterns:
|
84
|
+
pattern = pattern.strip()
|
85
|
+
if not pattern or pattern.startswith("#"):
|
86
|
+
continue
|
87
|
+
|
88
|
+
is_absolute_match = pattern.startswith("/") and rel_path_str.startswith(
|
89
|
+
pattern[1:]
|
90
|
+
)
|
91
|
+
is_nested_match = "/" in pattern and pattern in rel_path_str
|
92
|
+
is_name_match = fnmatch.fnmatch(file_path.name, pattern)
|
93
|
+
is_part_match = pattern in rel_path.parts
|
94
|
+
|
95
|
+
if is_absolute_match or is_nested_match or is_name_match or is_part_match:
|
96
|
+
return True
|
97
|
+
|
98
|
+
except ValueError:
|
99
|
+
pass
|
100
|
+
|
101
|
+
return False
|
102
|
+
|
103
|
+
|
104
|
+
def _read_ignore_file(dir_path: Path, ignore_filename: str = ".gitignore") -> list[str]:
|
105
|
+
"""Read ignore patterns from a file in the directory.
|
106
|
+
|
107
|
+
Args:
|
108
|
+
dir_path: Directory to look for ignore file
|
109
|
+
ignore_filename: Name of ignore file to read
|
110
|
+
|
111
|
+
Returns:
|
112
|
+
List of ignore patterns
|
113
|
+
"""
|
114
|
+
ignore_file = dir_path / ignore_filename
|
115
|
+
if ignore_file.exists():
|
116
|
+
try:
|
117
|
+
with open(ignore_file, encoding="utf-8") as f:
|
118
|
+
return [line.strip() for line in f]
|
119
|
+
except Exception as e:
|
120
|
+
logger.warning(f"Failed to read {ignore_filename}: {e}")
|
121
|
+
return []
|
122
|
+
else:
|
123
|
+
return []
|
124
|
+
|
125
|
+
|
126
|
+
def _collect_ignore_patterns(
|
127
|
+
dir_path: Path,
|
128
|
+
ignore_patterns: list[str] | None = None,
|
129
|
+
ignore_filename: str = ".gitignore",
|
130
|
+
) -> list[str]:
|
131
|
+
"""Collect all ignore patterns from multiple sources.
|
132
|
+
|
133
|
+
Args:
|
134
|
+
dir_path: Directory to check for ignore files
|
135
|
+
ignore_patterns: Explicit ignore patterns
|
136
|
+
ignore_filename: Name of ignore file to read from directory
|
137
|
+
|
138
|
+
Returns:
|
139
|
+
Combined list of ignore patterns
|
140
|
+
"""
|
141
|
+
all_ignore_patterns = ignore_patterns or []
|
142
|
+
file_patterns = _read_ignore_file(dir_path, ignore_filename)
|
143
|
+
all_ignore_patterns.extend(file_patterns)
|
144
|
+
|
145
|
+
default_ignores = [".git", "__pycache__", "*.pyc", ".DS_Store", "node_modules"]
|
146
|
+
all_ignore_patterns.extend(default_ignores)
|
147
|
+
|
148
|
+
return all_ignore_patterns
|
149
|
+
|
150
|
+
|
151
|
+
def _create_directory_zip(
|
152
|
+
dir_path: Path,
|
153
|
+
zip_path: Path,
|
154
|
+
ignore_patterns: list[str] | None = None,
|
155
|
+
ignore_filename: str = ".gitignore",
|
156
|
+
) -> int:
|
157
|
+
"""Create a zip file from a directory with ignore patterns.
|
158
|
+
|
159
|
+
Args:
|
160
|
+
dir_path: Directory to zip
|
161
|
+
zip_path: Output zip file path
|
162
|
+
ignore_patterns: Explicit ignore patterns
|
163
|
+
ignore_filename: Name of ignore file to read from directory
|
164
|
+
|
165
|
+
Returns:
|
166
|
+
Size of created zip file in bytes
|
167
|
+
"""
|
168
|
+
all_ignore_patterns = _collect_ignore_patterns(
|
169
|
+
dir_path, ignore_patterns, ignore_filename
|
170
|
+
)
|
171
|
+
|
172
|
+
logger.debug(f"Creating zip with ignore patterns: {all_ignore_patterns}")
|
173
|
+
|
174
|
+
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
|
175
|
+
for file_path in dir_path.rglob("*"):
|
176
|
+
if file_path.is_file() and not _should_ignore_file(
|
177
|
+
file_path, dir_path, all_ignore_patterns
|
178
|
+
):
|
179
|
+
arcname = file_path.relative_to(dir_path)
|
180
|
+
zipf.write(file_path, arcname)
|
181
|
+
logger.debug(f"Added to zip: {arcname}")
|
182
|
+
|
183
|
+
zip_size = zip_path.stat().st_size
|
184
|
+
logger.debug(f"Created zip file {zip_path} with size {zip_size:,} bytes")
|
185
|
+
return zip_size
|
186
|
+
|
187
|
+
|
188
|
+
def _should_send_as_text_content(file_path: Path, file_size: int) -> bool:
|
189
|
+
"""Check if a file should be sent as text content instead of file upload.
|
190
|
+
|
191
|
+
Args:
|
192
|
+
file_path: Path to the file
|
193
|
+
file_size: Size of file in bytes
|
194
|
+
|
195
|
+
Returns:
|
196
|
+
True if file should be sent as text content
|
197
|
+
"""
|
198
|
+
# small files can be treated as raw text
|
199
|
+
if file_size >= SMALL_FILE_THRESHOLD_BYTES:
|
200
|
+
return False
|
201
|
+
|
202
|
+
file_extension = file_path.suffix.lower().lstrip(".")
|
203
|
+
return file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT
|
204
|
+
|
205
|
+
|
206
|
+
def _extract_text_from_file(file_path: Path) -> str | None:
|
207
|
+
"""Extract text content from a file.
|
208
|
+
|
209
|
+
Args:
|
210
|
+
file_path: Path to the file
|
211
|
+
|
212
|
+
Returns:
|
213
|
+
Extracted text content or None if extraction failed
|
214
|
+
"""
|
215
|
+
file_extension = file_path.suffix.lower().lstrip(".")
|
216
|
+
|
217
|
+
if file_extension in SUPPORTED_FILE_TYPES_TO_TEXT_CONTENT:
|
218
|
+
try:
|
219
|
+
return file_path.read_text(encoding="utf-8")
|
220
|
+
except Exception as e:
|
221
|
+
logger.warning(f"Failed to extract text from {file_path}: {e}")
|
222
|
+
return None
|
223
|
+
else:
|
224
|
+
return None
|
225
|
+
|
226
|
+
|
227
|
+
def _setup_upload_progress(file_path: Path, file_size: int, progress_bar: tqdm) -> None:
|
228
|
+
"""Common setup for upload progress tracking."""
|
229
|
+
logger.debug(
|
230
|
+
f"Starting resumable upload for file: {file_path} (size: {file_size:,} bytes)"
|
231
|
+
)
|
232
|
+
progress_bar.set_description(f"Uploading {file_path.name}")
|
233
|
+
progress_bar.refresh()
|
234
|
+
|
235
|
+
|
236
|
+
async def _initiate_resumable_session(
|
237
|
+
session: aiohttp.ClientSession, signed_url: str
|
238
|
+
) -> str:
|
239
|
+
"""Initiate resumable upload session and return session URI."""
|
240
|
+
logger.debug("Initiating resumable upload session")
|
241
|
+
async with session.post(signed_url, headers=INITIATE_HEADERS) as initiate_response:
|
242
|
+
if initiate_response.status not in {200, 201}:
|
243
|
+
error_text = await initiate_response.text()
|
244
|
+
logger.error(
|
245
|
+
f"Failed to initiate resumable session: {initiate_response.status}"
|
246
|
+
)
|
247
|
+
logger.error(f"Response: {error_text}")
|
248
|
+
initiate_response.raise_for_status()
|
249
|
+
|
250
|
+
return _validate_session_uri(initiate_response.headers.get("location"))
|
251
|
+
|
252
|
+
|
253
|
+
# TODO: temp
|
254
|
+
def _log_upload_debug(signed_url: str) -> None:
|
255
|
+
"""Common debug logging for uploads."""
|
256
|
+
logger.debug(f"Signed URL: {signed_url[:100]}...")
|
257
|
+
|
258
|
+
|
259
|
+
# TODO: temp
|
260
|
+
def _validate_session_uri(session_uri: str | None) -> str:
|
261
|
+
"""Validate and return session URI or raise exception."""
|
262
|
+
if not session_uri:
|
263
|
+
raise DataStorageError(
|
264
|
+
"No session URI returned from resumable upload initiation"
|
265
|
+
)
|
266
|
+
logger.debug(f"Resumable session initiated. Session URI: {session_uri[:100]}...")
|
267
|
+
return session_uri
|
268
|
+
|
269
|
+
|
270
|
+
async def _upload_chunk_with_retry(
|
271
|
+
session: aiohttp.ClientSession,
|
272
|
+
session_uri: str,
|
273
|
+
chunk_data: bytes,
|
274
|
+
range_start: int,
|
275
|
+
file_size: int,
|
276
|
+
progress_bar: tqdm,
|
277
|
+
) -> int:
|
278
|
+
"""Upload a single chunk with retry logic."""
|
279
|
+
range_end = range_start + len(chunk_data) - 1
|
280
|
+
chunk_headers = {
|
281
|
+
"Content-Type": "application/octet-stream",
|
282
|
+
"Content-Length": str(len(chunk_data)),
|
283
|
+
"Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
|
284
|
+
}
|
285
|
+
|
286
|
+
for attempt in range(MAX_RETRIES):
|
287
|
+
try:
|
288
|
+
async with session.put(
|
289
|
+
session_uri, data=chunk_data, headers=chunk_headers
|
290
|
+
) as chunk_response:
|
291
|
+
if chunk_response.status == HTTP_RESUME_INCOMPLETE:
|
292
|
+
progress_bar.update(len(chunk_data))
|
293
|
+
logger.debug(f"Uploaded chunk: {range_end + 1}/{file_size} bytes")
|
294
|
+
return len(chunk_data)
|
295
|
+
if chunk_response.status in {200, 201}:
|
296
|
+
progress_bar.update(len(chunk_data))
|
297
|
+
logger.debug(
|
298
|
+
f"Upload completed successfully. Final response: {chunk_response.status}"
|
299
|
+
)
|
300
|
+
return len(chunk_data)
|
301
|
+
|
302
|
+
error_text = await chunk_response.text()
|
303
|
+
logger.warning(
|
304
|
+
f"Chunk upload failed (attempt {attempt + 1}/{MAX_RETRIES}): {chunk_response.status}"
|
305
|
+
)
|
306
|
+
logger.warning(f"Response: {error_text}")
|
307
|
+
if attempt == MAX_RETRIES - 1:
|
308
|
+
chunk_response.raise_for_status()
|
309
|
+
|
310
|
+
except (TimeoutError, aiohttp.ClientError) as e:
|
311
|
+
logger.warning(
|
312
|
+
f"Chunk upload error (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
|
313
|
+
)
|
314
|
+
if attempt == MAX_RETRIES - 1:
|
315
|
+
raise
|
316
|
+
await asyncio.sleep(2**attempt)
|
317
|
+
|
318
|
+
return 0
|
319
|
+
|
320
|
+
|
321
|
+
async def _aupload_file_with_progress(
|
322
|
+
signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
|
323
|
+
) -> None:
|
324
|
+
"""Upload a file asynchronously using aiohttp with signed URL initiation."""
|
325
|
+
_setup_upload_progress(file_path, file_size, progress_bar)
|
326
|
+
_log_upload_debug(signed_url)
|
327
|
+
|
328
|
+
try:
|
329
|
+
retry_config = aiohttp.ClientTimeout(
|
330
|
+
total=max(600.0, file_size / (512 * 1024)), connect=30, sock_read=30
|
331
|
+
)
|
332
|
+
connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
|
333
|
+
|
334
|
+
async with aiohttp.ClientSession(
|
335
|
+
connector=connector, timeout=retry_config
|
336
|
+
) as session:
|
337
|
+
session_uri = await _initiate_resumable_session(session, signed_url)
|
338
|
+
|
339
|
+
async with aiofiles.open(file_path, "rb") as file_obj:
|
340
|
+
bytes_uploaded = 0
|
341
|
+
|
342
|
+
while bytes_uploaded < file_size:
|
343
|
+
remaining = file_size - bytes_uploaded
|
344
|
+
current_chunk_size = min(CHUNK_SIZE, remaining)
|
345
|
+
chunk_data = await file_obj.read(current_chunk_size)
|
346
|
+
|
347
|
+
if not chunk_data:
|
348
|
+
break
|
349
|
+
|
350
|
+
uploaded_bytes = await _upload_chunk_with_retry(
|
351
|
+
session,
|
352
|
+
session_uri,
|
353
|
+
chunk_data,
|
354
|
+
bytes_uploaded,
|
355
|
+
file_size,
|
356
|
+
progress_bar,
|
357
|
+
)
|
358
|
+
bytes_uploaded += uploaded_bytes
|
359
|
+
|
360
|
+
if bytes_uploaded >= file_size:
|
361
|
+
break
|
362
|
+
|
363
|
+
logger.debug("Upload completed successfully")
|
364
|
+
|
365
|
+
except Exception as e:
|
366
|
+
logger.error(f"Async resumable upload error: {type(e).__name__}: {e}")
|
367
|
+
raise
|
368
|
+
|
369
|
+
|
370
|
+
def _upload_file_with_progress(
|
371
|
+
signed_url: str, file_path: Path, progress_bar: tqdm, file_size: int
|
372
|
+
) -> None:
|
373
|
+
"""Upload a file synchronously using google.resumable_media with signed URL initiation."""
|
374
|
+
_setup_upload_progress(file_path, file_size, progress_bar)
|
375
|
+
_log_upload_debug(signed_url)
|
376
|
+
|
377
|
+
try:
|
378
|
+
session = requests_lib.Session()
|
379
|
+
retry_strategy = Retry(
|
380
|
+
total=MAX_RETRIES,
|
381
|
+
backoff_factor=2,
|
382
|
+
status_forcelist=[429, 500, 502, 503, 504],
|
383
|
+
allowed_methods=["POST", "PUT", "PATCH"],
|
384
|
+
)
|
385
|
+
adapter = HTTPAdapter(max_retries=retry_strategy)
|
386
|
+
session.mount("http://", adapter)
|
387
|
+
session.mount("https://", adapter)
|
388
|
+
|
389
|
+
logger.debug("Initiating resumable upload session")
|
390
|
+
initiate_response = session.post(
|
391
|
+
signed_url, headers=INITIATE_HEADERS, timeout=30
|
392
|
+
)
|
393
|
+
|
394
|
+
if initiate_response.status_code not in {200, 201}:
|
395
|
+
logger.error(
|
396
|
+
f"Failed to initiate resumable session: {initiate_response.status_code}"
|
397
|
+
)
|
398
|
+
logger.error(f"Response: {initiate_response.text}")
|
399
|
+
initiate_response.raise_for_status()
|
400
|
+
|
401
|
+
session_uri = _validate_session_uri(initiate_response.headers.get("location"))
|
402
|
+
|
403
|
+
with open(file_path, "rb") as file_obj:
|
404
|
+
upload = resumable_requests.ResumableUpload(
|
405
|
+
upload_url=signed_url, chunk_size=CHUNK_SIZE
|
406
|
+
)
|
407
|
+
|
408
|
+
upload._resumable_url = session_uri
|
409
|
+
upload._stream = file_obj
|
410
|
+
upload._total_bytes = file_size
|
411
|
+
|
412
|
+
wrapped_file = ProgressWrapper(file_obj, progress_bar)
|
413
|
+
upload._stream = wrapped_file
|
414
|
+
|
415
|
+
while not upload.finished:
|
416
|
+
try:
|
417
|
+
upload.transmit_next_chunk(session)
|
418
|
+
except Exception as e:
|
419
|
+
logger.error(f"Chunk upload failed: {e}")
|
420
|
+
raise
|
421
|
+
|
422
|
+
logger.debug("Upload completed successfully using resumable_media library")
|
423
|
+
|
424
|
+
except Exception as e:
|
425
|
+
logger.error(f"Sync resumable upload error: {type(e).__name__}: {e}")
|
426
|
+
raise
|
427
|
+
|
428
|
+
|
429
|
+
class RestClientError(Exception):
|
430
|
+
"""Base exception for REST client errors."""
|
431
|
+
|
432
|
+
|
433
|
+
class DataStorageError(RestClientError):
|
434
|
+
"""Base exception for data storage operations."""
|
435
|
+
|
436
|
+
|
437
|
+
class DataStorageCreationError(DataStorageError):
|
438
|
+
"""Raised when there's an error creating a data storage entry."""
|
439
|
+
|
440
|
+
|
441
|
+
class ProgressWrapper:
|
442
|
+
"""Common progress wrapper for file uploads."""
|
443
|
+
|
444
|
+
def __init__(self, file_obj, progress_bar):
|
445
|
+
self.file_obj = file_obj
|
446
|
+
self.progress_bar = progress_bar
|
447
|
+
self.bytes_read = 0
|
448
|
+
|
449
|
+
def read(self, size=-1):
|
450
|
+
data = self.file_obj.read(size)
|
451
|
+
if data:
|
452
|
+
self.bytes_read += len(data)
|
453
|
+
current_pos = self.file_obj.tell()
|
454
|
+
if current_pos > self.progress_bar.n:
|
455
|
+
self.progress_bar.update(current_pos - self.progress_bar.n)
|
456
|
+
return data
|
457
|
+
|
458
|
+
def seek(self, offset, whence=0):
|
459
|
+
return self.file_obj.seek(offset, whence)
|
460
|
+
|
461
|
+
def tell(self):
|
462
|
+
return self.file_obj.tell()
|
463
|
+
|
464
|
+
|
465
|
+
class DataStorageMethods:
|
466
|
+
"""Data storage methods for RestClient.
|
467
|
+
|
468
|
+
This class contains methods for interacting with the data storage API endpoints.
|
469
|
+
"""
|
470
|
+
|
471
|
+
# needed for mypy `NoReturn`
|
472
|
+
def _handle_http_errors(self, e: HTTPStatusError) -> NoReturn:
|
473
|
+
"""Handle common HTTP errors for data storage operations."""
|
474
|
+
if e.response.status_code == codes.FORBIDDEN:
|
475
|
+
raise DataStorageCreationError(
|
476
|
+
"Not authorized to create data storage entries"
|
477
|
+
) from e
|
478
|
+
if e.response.status_code == codes.UNPROCESSABLE_ENTITY:
|
479
|
+
raise DataStorageCreationError(
|
480
|
+
f"Invalid request payload: {e.response.text}"
|
481
|
+
) from e
|
482
|
+
raise DataStorageCreationError(
|
483
|
+
f"Error creating data storage entry: {e.response.status_code} - {e.response.text}"
|
484
|
+
) from e
|
485
|
+
|
486
|
+
def _validate_file_path(self, file_path: str | Path) -> Path:
|
487
|
+
"""Validate file path exists and return Path object."""
|
488
|
+
file_path = Path(file_path)
|
489
|
+
if not file_path.exists():
|
490
|
+
raise DataStorageCreationError(f"File or directory not found: {file_path}")
|
491
|
+
return file_path
|
492
|
+
|
493
|
+
def _build_zip_path(self, name: str, path: str | None) -> str:
|
494
|
+
"""Build GCS path for zip file."""
|
495
|
+
zip_filename = name if name.endswith(".zip") else f"{name}.zip"
|
496
|
+
if path:
|
497
|
+
return f"{path.rstrip('/')}/{zip_filename}"
|
498
|
+
return zip_filename
|
499
|
+
|
500
|
+
# TODO: methods in here need to be moved to fh tools
|
501
|
+
# =====================================
|
502
|
+
def _is_zip_file(self, file_path: Path) -> bool:
|
503
|
+
"""Check if a file is a zip file by checking its magic bytes."""
|
504
|
+
try:
|
505
|
+
with open(file_path, "rb") as f:
|
506
|
+
magic = f.read(2)
|
507
|
+
return magic == b"PK"
|
508
|
+
except Exception:
|
509
|
+
return False
|
510
|
+
|
511
|
+
def _extract_zip_file(self, zip_path: Path, extract_to: Path) -> Path:
|
512
|
+
"""Extract a zip file and return the path to the extracted content.
|
513
|
+
|
514
|
+
Args:
|
515
|
+
zip_path: Path to the zip file
|
516
|
+
extract_to: Directory to extract to
|
517
|
+
|
518
|
+
Returns:
|
519
|
+
Path to the extracted content (directory or single file)
|
520
|
+
"""
|
521
|
+
extract_dir = extract_to / "extracted"
|
522
|
+
extract_dir.mkdir(exist_ok=True)
|
523
|
+
|
524
|
+
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
525
|
+
zip_ref.extractall(extract_dir)
|
526
|
+
extracted_items = list(extract_dir.iterdir())
|
527
|
+
|
528
|
+
if len(extracted_items) == 1:
|
529
|
+
return extracted_items[0]
|
530
|
+
return extract_dir
|
531
|
+
|
532
|
+
async def _adownload_from_gcs(self, signed_url: str) -> Path:
|
533
|
+
"""Download file from GCS using signed URL and handle unzipping if needed.
|
534
|
+
|
535
|
+
Args:
|
536
|
+
signed_url: The signed URL to download from
|
537
|
+
|
538
|
+
Returns:
|
539
|
+
Path to the downloaded file (or unzipped directory if it was a zip)
|
540
|
+
"""
|
541
|
+
try:
|
542
|
+
with tempfile.TemporaryDirectory() as temp_dir_str:
|
543
|
+
temp_dir = Path(temp_dir_str)
|
544
|
+
temp_file = temp_dir / "downloaded_file"
|
545
|
+
|
546
|
+
async with self.async_client.stream("GET", signed_url) as response:
|
547
|
+
response.raise_for_status()
|
548
|
+
|
549
|
+
content_disposition = response.headers.get(
|
550
|
+
"content-disposition", ""
|
551
|
+
)
|
552
|
+
filename = "downloaded_file"
|
553
|
+
if "filename=" in content_disposition:
|
554
|
+
filename = content_disposition.split("filename=")[-1].strip('"')
|
555
|
+
|
556
|
+
if filename != "downloaded_file":
|
557
|
+
temp_file = temp_dir / filename
|
558
|
+
|
559
|
+
async with aiofiles.open(temp_file, "wb") as f:
|
560
|
+
async for chunk in response.aiter_bytes(chunk_size=8192):
|
561
|
+
await f.write(chunk)
|
562
|
+
|
563
|
+
logger.debug(
|
564
|
+
f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
|
565
|
+
)
|
566
|
+
|
567
|
+
if self._is_zip_file(temp_file):
|
568
|
+
logger.debug(f"File {temp_file} is a zip file, extracting...")
|
569
|
+
extracted_path = self._extract_zip_file(temp_file, temp_dir)
|
570
|
+
|
571
|
+
final_temp_dir = Path(tempfile.mkdtemp())
|
572
|
+
final_path = final_temp_dir / extracted_path.name
|
573
|
+
|
574
|
+
if extracted_path.is_dir():
|
575
|
+
shutil.copytree(extracted_path, final_path)
|
576
|
+
else:
|
577
|
+
shutil.copy2(extracted_path, final_path)
|
578
|
+
|
579
|
+
return final_path
|
580
|
+
final_temp_dir = Path(tempfile.mkdtemp())
|
581
|
+
final_file = final_temp_dir / temp_file.name
|
582
|
+
shutil.copy2(temp_file, final_file)
|
583
|
+
return final_file
|
584
|
+
|
585
|
+
except Exception as e:
|
586
|
+
raise DataStorageCreationError(f"Failed to download from GCS: {e}") from e
|
587
|
+
|
588
|
+
def _download_from_gcs(self, signed_url: str) -> Path:
|
589
|
+
"""Download file from GCS using signed URL and handle unzipping if needed (sync version).
|
590
|
+
|
591
|
+
Args:
|
592
|
+
signed_url: The signed URL to download from
|
593
|
+
|
594
|
+
Returns:
|
595
|
+
Path to the downloaded file (or unzipped directory if it was a zip)
|
596
|
+
"""
|
597
|
+
try:
|
598
|
+
with tempfile.TemporaryDirectory() as temp_dir_str:
|
599
|
+
temp_dir = Path(temp_dir_str)
|
600
|
+
temp_file = temp_dir / "downloaded_file"
|
601
|
+
|
602
|
+
with requests_lib.get(signed_url, stream=True, timeout=30) as response:
|
603
|
+
response.raise_for_status()
|
604
|
+
|
605
|
+
content_disposition = response.headers.get(
|
606
|
+
"content-disposition", ""
|
607
|
+
)
|
608
|
+
filename = "downloaded_file"
|
609
|
+
if "filename=" in content_disposition:
|
610
|
+
filename = content_disposition.split("filename=")[-1].strip('"')
|
611
|
+
|
612
|
+
if filename != "downloaded_file":
|
613
|
+
temp_file = temp_dir / filename
|
614
|
+
|
615
|
+
with open(temp_file, "wb") as f:
|
616
|
+
for chunk in response.iter_content(chunk_size=8192):
|
617
|
+
f.write(chunk)
|
618
|
+
|
619
|
+
logger.debug(
|
620
|
+
f"Downloaded file to {temp_file} (size: {temp_file.stat().st_size:,} bytes)"
|
621
|
+
)
|
622
|
+
|
623
|
+
if self._is_zip_file(temp_file):
|
624
|
+
logger.debug(f"File {temp_file} is a zip file, extracting...")
|
625
|
+
extracted_path = self._extract_zip_file(temp_file, temp_dir)
|
626
|
+
|
627
|
+
final_temp_dir = Path(tempfile.mkdtemp())
|
628
|
+
final_path = final_temp_dir / extracted_path.name
|
629
|
+
|
630
|
+
if extracted_path.is_dir():
|
631
|
+
shutil.copytree(extracted_path, final_path)
|
632
|
+
else:
|
633
|
+
shutil.copy2(extracted_path, final_path)
|
634
|
+
|
635
|
+
return final_path
|
636
|
+
final_temp_dir = Path(tempfile.mkdtemp())
|
637
|
+
final_file = final_temp_dir / temp_file.name
|
638
|
+
shutil.copy2(temp_file, final_file)
|
639
|
+
return final_file
|
640
|
+
|
641
|
+
except Exception as e:
|
642
|
+
raise DataStorageCreationError(f"Failed to download from GCS: {e}") from e
|
643
|
+
|
644
|
+
# =====================================
|
645
|
+
|
646
|
+
def _prepare_single_file_upload(
|
647
|
+
self, name: str, file_path: Path, description: str | None, path: str | None
|
648
|
+
) -> tuple[int, DataStorageRequestPayload | None]:
|
649
|
+
"""Prepare single file for upload, return file size and payload if text content."""
|
650
|
+
file_size = file_path.stat().st_size
|
651
|
+
logger.debug(
|
652
|
+
f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
|
653
|
+
)
|
654
|
+
|
655
|
+
if _should_send_as_text_content(file_path, file_size):
|
656
|
+
logger.debug(
|
657
|
+
f"Small text file ({file_size:,} bytes) - sending as text content"
|
658
|
+
)
|
659
|
+
text_content = _extract_text_from_file(file_path)
|
660
|
+
if text_content is not None:
|
661
|
+
return file_size, DataStorageRequestPayload(
|
662
|
+
name=name,
|
663
|
+
description=description,
|
664
|
+
content=text_content,
|
665
|
+
path=path,
|
666
|
+
is_collection=False,
|
667
|
+
)
|
668
|
+
logger.warning(
|
669
|
+
"Could not extract text content, falling back to file upload"
|
670
|
+
)
|
671
|
+
|
672
|
+
return file_size, None
|
673
|
+
|
674
|
+
def _create_data_storage_entry(
|
675
|
+
self, payload: DataStorageRequestPayload
|
676
|
+
) -> DataStorageResponse:
|
677
|
+
"""Create data storage entry via API (sync version)."""
|
678
|
+
response = self.client.post(
|
679
|
+
"/v0.1/data-storage",
|
680
|
+
json=payload.model_dump(mode="json", exclude_none=True),
|
681
|
+
)
|
682
|
+
response.raise_for_status()
|
683
|
+
return DataStorageResponse.model_validate(response.json())
|
684
|
+
|
685
|
+
async def _acreate_data_storage_entry(
|
686
|
+
self, payload: DataStorageRequestPayload
|
687
|
+
) -> DataStorageResponse:
|
688
|
+
"""Create data storage entry via API (async version)."""
|
689
|
+
response = await self.async_client.post(
|
690
|
+
"/v0.1/data-storage",
|
691
|
+
json=payload.model_dump(mode="json", exclude_none=True),
|
692
|
+
)
|
693
|
+
response.raise_for_status()
|
694
|
+
return DataStorageResponse.model_validate(response.json())
|
695
|
+
|
696
|
+
def _generate_folder_description_from_files(
|
697
|
+
self, dir_path: Path, manifest: DirectoryManifest
|
698
|
+
) -> str:
|
699
|
+
"""Generate folder description by concatenating descriptions of top-level files."""
|
700
|
+
descriptions = []
|
701
|
+
|
702
|
+
# Get top-level files only (not recursive)
|
703
|
+
for item in dir_path.iterdir():
|
704
|
+
if item.is_file():
|
705
|
+
# Try to get description from manifest first
|
706
|
+
file_desc = manifest.get_entry_description(item.name)
|
707
|
+
|
708
|
+
if file_desc:
|
709
|
+
descriptions.append(f"{item.name}: {file_desc}")
|
710
|
+
else:
|
711
|
+
descriptions.append(item.name)
|
712
|
+
|
713
|
+
if descriptions:
|
714
|
+
return f"Directory containing: {', '.join(descriptions)}"
|
715
|
+
return f"Directory: {dir_path.name}"
|
716
|
+
|
717
|
+
def _load_manifest(
|
718
|
+
self, dir_path: Path, manifest_filename: str | None
|
719
|
+
) -> DirectoryManifest:
|
720
|
+
"""Load and parse a manifest file (JSON or YAML) into a structured model."""
|
721
|
+
if not manifest_filename:
|
722
|
+
return DirectoryManifest()
|
723
|
+
|
724
|
+
manifest_path = dir_path / manifest_filename
|
725
|
+
if not manifest_path.exists():
|
726
|
+
logger.error(f"Manifest file not found at {manifest_path}")
|
727
|
+
raise DataStorageCreationError(
|
728
|
+
f"Manifest file {manifest_filename} not found in directory {dir_path}. Ensure the manifest exists and is correctly named, or do not pass it as an argument."
|
729
|
+
)
|
730
|
+
|
731
|
+
try:
|
732
|
+
with open(manifest_path, encoding="utf-8") as f:
|
733
|
+
data = {}
|
734
|
+
if manifest_filename.lower().endswith(".json"):
|
735
|
+
data = json.load(f)
|
736
|
+
elif manifest_filename.lower().endswith((".yaml", ".yml")):
|
737
|
+
if yaml is None:
|
738
|
+
raise ImportError(
|
739
|
+
"pyyaml is required to parse .yaml manifest files. "
|
740
|
+
"Please install it with `pip install pyyaml`."
|
741
|
+
)
|
742
|
+
data = yaml.safe_load(f)
|
743
|
+
else:
|
744
|
+
logger.warning(
|
745
|
+
f"Unsupported manifest file extension: {manifest_filename}"
|
746
|
+
)
|
747
|
+
return DirectoryManifest()
|
748
|
+
|
749
|
+
return DirectoryManifest.from_dict(data or {})
|
750
|
+
|
751
|
+
except Exception as e:
|
752
|
+
logger.warning(f"Failed to load manifest {manifest_filename}: {e}")
|
753
|
+
|
754
|
+
return DirectoryManifest()
|
755
|
+
|
756
|
+
def _upload_data_directory(
|
757
|
+
self,
|
758
|
+
name: str,
|
759
|
+
dir_path: Path,
|
760
|
+
description: str | None,
|
761
|
+
path: str | None = None,
|
762
|
+
ignore_patterns: list[str] | None = None,
|
763
|
+
ignore_filename: str = ".gitignore",
|
764
|
+
) -> DataStorageResponse:
|
765
|
+
"""Upload a directory as a single zip file collection.
|
766
|
+
|
767
|
+
Args:
|
768
|
+
name: Name for the directory collection
|
769
|
+
dir_path: Path to directory to zip and upload
|
770
|
+
description: Description for the collection
|
771
|
+
path: Optional GCS path for the zip file
|
772
|
+
ignore_patterns: List of patterns to ignore when zipping
|
773
|
+
ignore_filename: Name of ignore file to read from directory
|
774
|
+
|
775
|
+
Returns:
|
776
|
+
DataStorageResponse for the uploaded zip file
|
777
|
+
"""
|
778
|
+
logger.debug(f"Uploading directory as zip: {dir_path}")
|
779
|
+
|
780
|
+
with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
|
781
|
+
temp_zip_path = Path(temp_file.name)
|
782
|
+
|
783
|
+
zip_size = _create_directory_zip(
|
784
|
+
dir_path, temp_zip_path, ignore_patterns, ignore_filename
|
785
|
+
)
|
786
|
+
|
787
|
+
zip_gcs_path = self._build_zip_path(name, path)
|
788
|
+
payload = DataStorageRequestPayload(
|
789
|
+
name=name,
|
790
|
+
description=description,
|
791
|
+
path=zip_gcs_path,
|
792
|
+
is_collection=True,
|
793
|
+
)
|
794
|
+
|
795
|
+
logger.debug(
|
796
|
+
f"Creating data storage entry for zip: {payload.model_dump(exclude_none=True)}"
|
797
|
+
)
|
798
|
+
data_storage_response = self._create_data_storage_entry(payload)
|
799
|
+
|
800
|
+
if not data_storage_response.signed_url:
|
801
|
+
raise DataStorageCreationError("No signed URL returned for zip upload")
|
802
|
+
|
803
|
+
with tqdm(
|
804
|
+
total=zip_size,
|
805
|
+
unit="B",
|
806
|
+
unit_scale=True,
|
807
|
+
unit_divisor=1024,
|
808
|
+
desc=f"Uploading {dir_path.name} (zipped)",
|
809
|
+
miniters=1,
|
810
|
+
mininterval=0.1,
|
811
|
+
) as pbar:
|
812
|
+
_upload_file_with_progress(
|
813
|
+
data_storage_response.signed_url, temp_zip_path, pbar, zip_size
|
814
|
+
)
|
815
|
+
|
816
|
+
status_response = self.client.patch(
|
817
|
+
f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
|
818
|
+
json={"status": "active"},
|
819
|
+
)
|
820
|
+
status_response.raise_for_status()
|
821
|
+
|
822
|
+
logger.debug(
|
823
|
+
f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
|
824
|
+
)
|
825
|
+
return DataStorageResponse.model_validate(status_response.json())
|
826
|
+
|
827
|
+
async def _aupload_data_directory(
|
828
|
+
self,
|
829
|
+
name: str,
|
830
|
+
dir_path: Path,
|
831
|
+
description: str | None,
|
832
|
+
path: str | None = None,
|
833
|
+
ignore_patterns: list[str] | None = None,
|
834
|
+
ignore_filename: str = ".gitignore",
|
835
|
+
) -> DataStorageResponse:
|
836
|
+
"""Asynchronously upload a directory as a single zip file.
|
837
|
+
|
838
|
+
Args:
|
839
|
+
name: Name for the directory collection
|
840
|
+
dir_path: Path to directory to zip and upload
|
841
|
+
description: Description for the collection
|
842
|
+
path: Optional GCS path for the zip file
|
843
|
+
ignore_patterns: List of patterns to ignore when zipping
|
844
|
+
ignore_filename: Name of ignore file to read from directory
|
845
|
+
|
846
|
+
Returns:
|
847
|
+
DataStorageResponse for the uploaded zip file
|
848
|
+
"""
|
849
|
+
logger.debug(f"Async uploading directory as zip: {dir_path}")
|
850
|
+
|
851
|
+
with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
|
852
|
+
temp_zip_path = Path(temp_file.name)
|
853
|
+
|
854
|
+
zip_size = _create_directory_zip(
|
855
|
+
dir_path, temp_zip_path, ignore_patterns, ignore_filename
|
856
|
+
)
|
857
|
+
|
858
|
+
zip_gcs_path = self._build_zip_path(name, path)
|
859
|
+
payload = DataStorageRequestPayload(
|
860
|
+
name=name,
|
861
|
+
description=description,
|
862
|
+
path=zip_gcs_path,
|
863
|
+
is_collection=True,
|
864
|
+
)
|
865
|
+
|
866
|
+
data_storage_response = await self._acreate_data_storage_entry(payload)
|
867
|
+
|
868
|
+
if not data_storage_response.signed_url:
|
869
|
+
raise DataStorageCreationError("No signed URL returned for zip upload")
|
870
|
+
|
871
|
+
with tqdm(
|
872
|
+
total=zip_size,
|
873
|
+
unit="B",
|
874
|
+
unit_scale=True,
|
875
|
+
unit_divisor=1024,
|
876
|
+
desc=f"Uploading {dir_path.name} (zipped)",
|
877
|
+
miniters=1,
|
878
|
+
mininterval=0.1,
|
879
|
+
) as pbar:
|
880
|
+
await _aupload_file_with_progress(
|
881
|
+
data_storage_response.signed_url, temp_zip_path, pbar, zip_size
|
882
|
+
)
|
883
|
+
|
884
|
+
status_response = await self.async_client.patch(
|
885
|
+
f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
|
886
|
+
json={"status": "active"},
|
887
|
+
)
|
888
|
+
status_response.raise_for_status()
|
889
|
+
|
890
|
+
logger.debug(
|
891
|
+
f"Successfully uploaded directory {dir_path.name} as zip ({zip_size:,} bytes)"
|
892
|
+
)
|
893
|
+
return DataStorageResponse.model_validate(status_response.json())
|
894
|
+
|
895
|
+
def _upload_data_single_file(
|
896
|
+
self,
|
897
|
+
name: str,
|
898
|
+
file_path: Path,
|
899
|
+
description: str | None,
|
900
|
+
path: str | None = None,
|
901
|
+
) -> DataStorageResponse:
|
902
|
+
"""Upload a single file."""
|
903
|
+
file_size = file_path.stat().st_size
|
904
|
+
logger.debug(
|
905
|
+
f"Starting upload of single file: {file_path} (size: {file_size:,} bytes)"
|
906
|
+
)
|
907
|
+
|
908
|
+
if _should_send_as_text_content(file_path, file_size):
|
909
|
+
logger.debug(
|
910
|
+
f"Small text file ({file_size:,} bytes) - sending as text content"
|
911
|
+
)
|
912
|
+
|
913
|
+
text_content = _extract_text_from_file(file_path)
|
914
|
+
if text_content is not None:
|
915
|
+
payload = DataStorageRequestPayload(
|
916
|
+
name=name,
|
917
|
+
description=description,
|
918
|
+
content=text_content,
|
919
|
+
path=path,
|
920
|
+
is_collection=False,
|
921
|
+
)
|
922
|
+
|
923
|
+
logger.debug("Sending file as text content")
|
924
|
+
return self._create_data_storage_entry(payload)
|
925
|
+
logger.warning(
|
926
|
+
"Could not extract text content, falling back to file upload"
|
927
|
+
)
|
928
|
+
|
929
|
+
logger.debug(
|
930
|
+
f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
|
931
|
+
)
|
932
|
+
payload = DataStorageRequestPayload(
|
933
|
+
name=name,
|
934
|
+
description=description,
|
935
|
+
path=path,
|
936
|
+
is_collection=False,
|
937
|
+
)
|
938
|
+
|
939
|
+
logger.debug(
|
940
|
+
f"Requesting signed URL with payload: {payload.model_dump(exclude_none=True)}"
|
941
|
+
)
|
942
|
+
|
943
|
+
data_storage_response = self._create_data_storage_entry(payload)
|
944
|
+
|
945
|
+
if not data_storage_response.signed_url:
|
946
|
+
raise DataStorageCreationError("No signed URL returned from server")
|
947
|
+
|
948
|
+
with tqdm(
|
949
|
+
total=file_size,
|
950
|
+
unit="B",
|
951
|
+
unit_scale=True,
|
952
|
+
unit_divisor=1024,
|
953
|
+
desc=f"Uploading {file_path.name}",
|
954
|
+
miniters=1,
|
955
|
+
mininterval=0.1,
|
956
|
+
) as pbar:
|
957
|
+
try:
|
958
|
+
_upload_file_with_progress(
|
959
|
+
data_storage_response.signed_url, file_path, pbar, file_size
|
960
|
+
)
|
961
|
+
logger.debug("File upload to signed URL completed successfully")
|
962
|
+
except Exception as e:
|
963
|
+
logger.error(f"Failed to upload file to signed URL: {e}")
|
964
|
+
raise
|
965
|
+
|
966
|
+
logger.debug("Updating data storage status to active")
|
967
|
+
status_response = self.client.patch(
|
968
|
+
f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
|
969
|
+
json={"status": "active"},
|
970
|
+
)
|
971
|
+
status_response.raise_for_status()
|
972
|
+
logger.debug("Data storage status updated successfully")
|
973
|
+
|
974
|
+
return DataStorageResponse.model_validate(status_response.json())
|
975
|
+
|
976
|
+
async def _aupload_data_single_file(
|
977
|
+
self,
|
978
|
+
name: str,
|
979
|
+
file_path: Path,
|
980
|
+
description: str | None,
|
981
|
+
path: str | None = None,
|
982
|
+
) -> DataStorageResponse:
|
983
|
+
"""Asynchronously upload a single file."""
|
984
|
+
file_size, text_payload = self._prepare_single_file_upload(
|
985
|
+
name, file_path, description, path
|
986
|
+
)
|
987
|
+
|
988
|
+
if text_payload:
|
989
|
+
logger.debug("Sending file as text content")
|
990
|
+
return await self._acreate_data_storage_entry(text_payload)
|
991
|
+
|
992
|
+
logger.debug(
|
993
|
+
f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
|
994
|
+
)
|
995
|
+
payload = DataStorageRequestPayload(
|
996
|
+
name=name,
|
997
|
+
description=description,
|
998
|
+
path=path,
|
999
|
+
is_collection=False,
|
1000
|
+
)
|
1001
|
+
|
1002
|
+
data_storage_response = await self._acreate_data_storage_entry(payload)
|
1003
|
+
|
1004
|
+
if not data_storage_response.signed_url:
|
1005
|
+
raise DataStorageCreationError("No signed URL returned from server")
|
1006
|
+
|
1007
|
+
with tqdm(
|
1008
|
+
total=file_size,
|
1009
|
+
unit="B",
|
1010
|
+
unit_scale=True,
|
1011
|
+
unit_divisor=1024,
|
1012
|
+
desc=f"Uploading {file_path.name}",
|
1013
|
+
miniters=1,
|
1014
|
+
mininterval=0.1,
|
1015
|
+
) as pbar:
|
1016
|
+
await _aupload_file_with_progress(
|
1017
|
+
data_storage_response.signed_url, file_path, pbar, file_size
|
1018
|
+
)
|
1019
|
+
|
1020
|
+
status_response = await self.async_client.patch(
|
1021
|
+
f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
|
1022
|
+
json={"status": "active"},
|
1023
|
+
)
|
1024
|
+
status_response.raise_for_status()
|
1025
|
+
|
1026
|
+
return DataStorageResponse.model_validate(status_response.json())
|
1027
|
+
|
1028
|
+
def _upload_data_single_file_with_parent(
|
1029
|
+
self,
|
1030
|
+
name: str,
|
1031
|
+
file_path: Path,
|
1032
|
+
description: str | None,
|
1033
|
+
path: str | None,
|
1034
|
+
parent_id: UUID | None,
|
1035
|
+
dataset_id: UUID | None = None,
|
1036
|
+
) -> DataStorageResponse:
|
1037
|
+
"""Upload a single file with a parent ID (sync version)."""
|
1038
|
+
file_size, text_payload = self._prepare_single_file_upload(
|
1039
|
+
name, file_path, description, path
|
1040
|
+
)
|
1041
|
+
|
1042
|
+
if text_payload:
|
1043
|
+
logger.debug("Sending file as text content with parent_id")
|
1044
|
+
text_payload.parent_id = parent_id
|
1045
|
+
text_payload.dataset_id = dataset_id
|
1046
|
+
return self._create_data_storage_entry(text_payload)
|
1047
|
+
|
1048
|
+
logger.debug(
|
1049
|
+
f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
|
1050
|
+
)
|
1051
|
+
payload = DataStorageRequestPayload(
|
1052
|
+
name=name,
|
1053
|
+
description=description,
|
1054
|
+
path=path,
|
1055
|
+
is_collection=False,
|
1056
|
+
parent_id=parent_id,
|
1057
|
+
dataset_id=dataset_id,
|
1058
|
+
)
|
1059
|
+
data_storage_response = self._create_data_storage_entry(payload)
|
1060
|
+
|
1061
|
+
if not data_storage_response.signed_url:
|
1062
|
+
raise DataStorageCreationError("No signed URL returned from server")
|
1063
|
+
|
1064
|
+
with tqdm(
|
1065
|
+
total=file_size,
|
1066
|
+
unit="B",
|
1067
|
+
unit_scale=True,
|
1068
|
+
unit_divisor=1024,
|
1069
|
+
desc=f"Uploading {file_path.name}",
|
1070
|
+
miniters=1,
|
1071
|
+
mininterval=0.1,
|
1072
|
+
leave=False,
|
1073
|
+
) as pbar:
|
1074
|
+
_upload_file_with_progress(
|
1075
|
+
data_storage_response.signed_url, file_path, pbar, file_size
|
1076
|
+
)
|
1077
|
+
|
1078
|
+
status_response = self.client.patch(
|
1079
|
+
f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
|
1080
|
+
json={"status": "active"},
|
1081
|
+
)
|
1082
|
+
status_response.raise_for_status()
|
1083
|
+
|
1084
|
+
return DataStorageResponse.model_validate(status_response.json())
|
1085
|
+
|
1086
|
+
def _process_file_item(
|
1087
|
+
self,
|
1088
|
+
item: Path,
|
1089
|
+
dir_manifest: DirectoryManifest,
|
1090
|
+
current_parent_id: UUID,
|
1091
|
+
dataset_id: UUID | None = None,
|
1092
|
+
) -> DataStorageResponse | None:
|
1093
|
+
"""Process a single file item for upload."""
|
1094
|
+
try:
|
1095
|
+
manifest_desc = dir_manifest.get_entry_description(item.name)
|
1096
|
+
file_description = manifest_desc or f"File: {item.name}"
|
1097
|
+
|
1098
|
+
logger.debug(
|
1099
|
+
f"Processing file {item.name} with description: '{file_description}'"
|
1100
|
+
)
|
1101
|
+
|
1102
|
+
return self._upload_data_single_file_with_parent(
|
1103
|
+
name=item.name,
|
1104
|
+
file_path=item,
|
1105
|
+
description=file_description,
|
1106
|
+
path=None,
|
1107
|
+
parent_id=current_parent_id,
|
1108
|
+
dataset_id=dataset_id,
|
1109
|
+
)
|
1110
|
+
except Exception as e:
|
1111
|
+
logger.error(f"Failed to upload file {item}: {e}")
|
1112
|
+
return None
|
1113
|
+
|
1114
|
+
def _upload_directory_hierarchically(
|
1115
|
+
self,
|
1116
|
+
name: str,
|
1117
|
+
dir_path: Path,
|
1118
|
+
description: str | None = None,
|
1119
|
+
manifest_filename: str | None = None,
|
1120
|
+
parent_id: UUID | None = None,
|
1121
|
+
ignore_patterns: list[str] | None = None,
|
1122
|
+
ignore_filename: str = ".gitignore",
|
1123
|
+
base_dir: Path | None = None,
|
1124
|
+
dir_manifest: DirectoryManifest | None = None,
|
1125
|
+
dataset_id: UUID | None = None,
|
1126
|
+
) -> list[DataStorageResponse]:
|
1127
|
+
"""Upload a directory with single dataset and individual file storage entries."""
|
1128
|
+
responses = []
|
1129
|
+
if parent_id is None:
|
1130
|
+
base_dir = dir_path
|
1131
|
+
all_ignore_patterns = _collect_ignore_patterns(
|
1132
|
+
base_dir, ignore_patterns, ignore_filename
|
1133
|
+
)
|
1134
|
+
|
1135
|
+
payload = DataStorageRequestPayload(
|
1136
|
+
name=name,
|
1137
|
+
description=description,
|
1138
|
+
parent_id=None,
|
1139
|
+
dataset_id=None,
|
1140
|
+
is_collection=False,
|
1141
|
+
)
|
1142
|
+
|
1143
|
+
dir_response = self._create_data_storage_entry(payload)
|
1144
|
+
responses.append(dir_response)
|
1145
|
+
current_parent_id = dir_response.data_storage.id
|
1146
|
+
current_dataset_id = dir_response.data_storage.dataset_id
|
1147
|
+
|
1148
|
+
dir_manifest = self._load_directory_manifest(
|
1149
|
+
manifest_filename, parent_id, dir_path
|
1150
|
+
)
|
1151
|
+
else:
|
1152
|
+
all_ignore_patterns = ignore_patterns or []
|
1153
|
+
current_parent_id = parent_id
|
1154
|
+
current_dataset_id = dataset_id
|
1155
|
+
|
1156
|
+
for item in dir_path.iterdir():
|
1157
|
+
if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
|
1158
|
+
continue
|
1159
|
+
|
1160
|
+
if item.is_dir():
|
1161
|
+
subdir_manifest = DirectoryManifest()
|
1162
|
+
if dir_manifest:
|
1163
|
+
entry = dir_manifest.entries.get(item.name)
|
1164
|
+
if isinstance(entry, DirectoryManifest):
|
1165
|
+
subdir_manifest = entry
|
1166
|
+
elif isinstance(entry, ManifestEntry):
|
1167
|
+
# Convert single entry to manifest
|
1168
|
+
subdir_manifest = DirectoryManifest(entries={item.name: entry})
|
1169
|
+
|
1170
|
+
subdir_description = subdir_manifest.get_entry_description(item.name)
|
1171
|
+
if not subdir_description:
|
1172
|
+
subdir_description = self._generate_folder_description_from_files(
|
1173
|
+
item, subdir_manifest
|
1174
|
+
)
|
1175
|
+
|
1176
|
+
subdir_payload = DataStorageRequestPayload(
|
1177
|
+
name=item.name,
|
1178
|
+
description=subdir_description,
|
1179
|
+
parent_id=current_parent_id,
|
1180
|
+
dataset_id=current_dataset_id,
|
1181
|
+
is_collection=False,
|
1182
|
+
)
|
1183
|
+
subdir_response = self._create_data_storage_entry(subdir_payload)
|
1184
|
+
responses.append(subdir_response)
|
1185
|
+
|
1186
|
+
subdir_responses = self._upload_directory_hierarchically(
|
1187
|
+
name=item.name,
|
1188
|
+
dir_path=item,
|
1189
|
+
description=None,
|
1190
|
+
manifest_filename=None,
|
1191
|
+
parent_id=subdir_response.data_storage.id,
|
1192
|
+
ignore_patterns=all_ignore_patterns,
|
1193
|
+
ignore_filename=ignore_filename,
|
1194
|
+
base_dir=base_dir,
|
1195
|
+
dir_manifest=subdir_manifest,
|
1196
|
+
dataset_id=current_dataset_id,
|
1197
|
+
)
|
1198
|
+
responses.extend(subdir_responses)
|
1199
|
+
elif item.is_file():
|
1200
|
+
file_response = self._process_file_item(
|
1201
|
+
item,
|
1202
|
+
dir_manifest or DirectoryManifest(),
|
1203
|
+
current_parent_id,
|
1204
|
+
current_dataset_id,
|
1205
|
+
)
|
1206
|
+
if file_response:
|
1207
|
+
responses.append(file_response)
|
1208
|
+
|
1209
|
+
return responses
|
1210
|
+
|
1211
|
+
def _load_directory_manifest(
|
1212
|
+
self,
|
1213
|
+
manifest_filename: str | None,
|
1214
|
+
parent_id: UUID | None,
|
1215
|
+
dir_path: Path,
|
1216
|
+
) -> DirectoryManifest:
|
1217
|
+
"""Load directory manifest if available."""
|
1218
|
+
if manifest_filename and not parent_id:
|
1219
|
+
manifest_data = self._load_manifest(Path.cwd(), manifest_filename)
|
1220
|
+
dir_name = dir_path.name
|
1221
|
+
logger.debug(
|
1222
|
+
f"Loaded manifest entries: {list(manifest_data.entries.keys())}"
|
1223
|
+
)
|
1224
|
+
logger.debug(
|
1225
|
+
f"Looking for manifest entry with directory name: '{dir_name}'"
|
1226
|
+
)
|
1227
|
+
|
1228
|
+
entry = manifest_data.entries.get(dir_name)
|
1229
|
+
if isinstance(entry, DirectoryManifest):
|
1230
|
+
return entry
|
1231
|
+
if isinstance(entry, ManifestEntry):
|
1232
|
+
return DirectoryManifest(entries={dir_name: entry})
|
1233
|
+
logger.debug(
|
1234
|
+
f"No manifest entry found for '{dir_name}', available keys: {list(manifest_data.entries.keys())}"
|
1235
|
+
)
|
1236
|
+
return DirectoryManifest()
|
1237
|
+
return DirectoryManifest()
|
1238
|
+
|
1239
|
+
async def _aupload_data_single_file_with_parent(
|
1240
|
+
self,
|
1241
|
+
name: str,
|
1242
|
+
file_path: Path,
|
1243
|
+
description: str | None,
|
1244
|
+
path: str | None,
|
1245
|
+
parent_id: UUID | None,
|
1246
|
+
dataset_id: UUID | None = None,
|
1247
|
+
) -> DataStorageResponse:
|
1248
|
+
"""Asynchronously upload a single file with a parent ID."""
|
1249
|
+
file_size, text_payload = self._prepare_single_file_upload(
|
1250
|
+
name, file_path, description, path
|
1251
|
+
)
|
1252
|
+
|
1253
|
+
if text_payload:
|
1254
|
+
logger.debug("Sending file as text content with parent_id")
|
1255
|
+
text_payload.parent_id = parent_id
|
1256
|
+
text_payload.dataset_id = dataset_id
|
1257
|
+
return await self._acreate_data_storage_entry(text_payload)
|
1258
|
+
|
1259
|
+
logger.debug(
|
1260
|
+
f"Large/binary file ({file_size:,} bytes) - requesting signed URL for upload"
|
1261
|
+
)
|
1262
|
+
payload = DataStorageRequestPayload(
|
1263
|
+
name=name,
|
1264
|
+
description=description,
|
1265
|
+
path=path,
|
1266
|
+
is_collection=False,
|
1267
|
+
parent_id=parent_id,
|
1268
|
+
dataset_id=dataset_id,
|
1269
|
+
)
|
1270
|
+
data_storage_response = await self._acreate_data_storage_entry(payload)
|
1271
|
+
|
1272
|
+
if not data_storage_response.signed_url:
|
1273
|
+
raise DataStorageCreationError("No signed URL returned from server")
|
1274
|
+
|
1275
|
+
with tqdm(
|
1276
|
+
total=file_size,
|
1277
|
+
unit="B",
|
1278
|
+
unit_scale=True,
|
1279
|
+
unit_divisor=1024,
|
1280
|
+
desc=f"Uploading {file_path.name}",
|
1281
|
+
miniters=1,
|
1282
|
+
mininterval=0.1,
|
1283
|
+
) as pbar:
|
1284
|
+
await _aupload_file_with_progress(
|
1285
|
+
data_storage_response.signed_url, file_path, pbar, file_size
|
1286
|
+
)
|
1287
|
+
|
1288
|
+
status_response = await self.async_client.patch(
|
1289
|
+
f"/v0.1/data-storage/{data_storage_response.data_storage.id}",
|
1290
|
+
json={"status": "active"},
|
1291
|
+
)
|
1292
|
+
status_response.raise_for_status()
|
1293
|
+
|
1294
|
+
return DataStorageResponse.model_validate(status_response.json())
|
1295
|
+
|
1296
|
+
async def _aprocess_file_item(
|
1297
|
+
self,
|
1298
|
+
item: Path,
|
1299
|
+
dir_manifest: DirectoryManifest,
|
1300
|
+
current_parent_id: UUID,
|
1301
|
+
dataset_id: UUID | None = None,
|
1302
|
+
) -> DataStorageResponse | None:
|
1303
|
+
"""Asynchronously process a single file item for upload."""
|
1304
|
+
try:
|
1305
|
+
manifest_desc = dir_manifest.get_entry_description(item.name)
|
1306
|
+
file_description = manifest_desc or f"File: {item.name}"
|
1307
|
+
|
1308
|
+
logger.debug(
|
1309
|
+
f"Processing file {item.name} with description: '{file_description}'"
|
1310
|
+
)
|
1311
|
+
|
1312
|
+
return await self._aupload_data_single_file_with_parent(
|
1313
|
+
name=item.name,
|
1314
|
+
file_path=item,
|
1315
|
+
description=file_description,
|
1316
|
+
path=None,
|
1317
|
+
parent_id=current_parent_id,
|
1318
|
+
dataset_id=dataset_id,
|
1319
|
+
)
|
1320
|
+
except Exception as e:
|
1321
|
+
logger.error(f"Failed to upload file {item}: {e}")
|
1322
|
+
return None
|
1323
|
+
|
1324
|
+
async def _aupload_directory_hierarchically(
|
1325
|
+
self,
|
1326
|
+
name: str,
|
1327
|
+
dir_path: Path,
|
1328
|
+
description: str | None = None,
|
1329
|
+
manifest_filename: str | None = None,
|
1330
|
+
parent_id: UUID | None = None,
|
1331
|
+
ignore_patterns: list[str] | None = None,
|
1332
|
+
ignore_filename: str = ".gitignore",
|
1333
|
+
base_dir: Path | None = None,
|
1334
|
+
dir_manifest: DirectoryManifest | None = None,
|
1335
|
+
dataset_id: UUID | None = None,
|
1336
|
+
) -> list[DataStorageResponse]:
|
1337
|
+
"""Upload a directory with single dataset and individual file storage entries (async)."""
|
1338
|
+
responses = []
|
1339
|
+
|
1340
|
+
if parent_id is None:
|
1341
|
+
base_dir = dir_path
|
1342
|
+
all_ignore_patterns = _collect_ignore_patterns(
|
1343
|
+
base_dir, ignore_patterns, ignore_filename
|
1344
|
+
)
|
1345
|
+
|
1346
|
+
payload = DataStorageRequestPayload(
|
1347
|
+
name=name,
|
1348
|
+
description=description,
|
1349
|
+
parent_id=None,
|
1350
|
+
dataset_id=None,
|
1351
|
+
is_collection=False,
|
1352
|
+
)
|
1353
|
+
|
1354
|
+
dir_response = await self._acreate_data_storage_entry(payload)
|
1355
|
+
responses.append(dir_response)
|
1356
|
+
current_parent_id = dir_response.data_storage.id
|
1357
|
+
current_dataset_id = dir_response.data_storage.dataset_id
|
1358
|
+
|
1359
|
+
dir_manifest = self._load_directory_manifest(
|
1360
|
+
manifest_filename, parent_id, dir_path
|
1361
|
+
)
|
1362
|
+
else:
|
1363
|
+
all_ignore_patterns = ignore_patterns or []
|
1364
|
+
current_parent_id = parent_id
|
1365
|
+
current_dataset_id = dataset_id
|
1366
|
+
|
1367
|
+
for item in dir_path.iterdir():
|
1368
|
+
if base_dir and _should_ignore_file(item, base_dir, all_ignore_patterns):
|
1369
|
+
continue
|
1370
|
+
|
1371
|
+
if item.is_dir():
|
1372
|
+
subdir_manifest = DirectoryManifest()
|
1373
|
+
if dir_manifest:
|
1374
|
+
entry = dir_manifest.entries.get(item.name)
|
1375
|
+
if isinstance(entry, DirectoryManifest):
|
1376
|
+
subdir_manifest = entry
|
1377
|
+
elif isinstance(entry, ManifestEntry):
|
1378
|
+
subdir_manifest = DirectoryManifest(entries={item.name: entry})
|
1379
|
+
|
1380
|
+
subdir_description = subdir_manifest.get_entry_description(item.name)
|
1381
|
+
if not subdir_description:
|
1382
|
+
subdir_description = self._generate_folder_description_from_files(
|
1383
|
+
item, subdir_manifest
|
1384
|
+
)
|
1385
|
+
|
1386
|
+
subdir_payload = DataStorageRequestPayload(
|
1387
|
+
name=item.name,
|
1388
|
+
description=subdir_description,
|
1389
|
+
parent_id=current_parent_id,
|
1390
|
+
dataset_id=current_dataset_id,
|
1391
|
+
is_collection=False,
|
1392
|
+
)
|
1393
|
+
subdir_response = await self._acreate_data_storage_entry(subdir_payload)
|
1394
|
+
responses.append(subdir_response)
|
1395
|
+
|
1396
|
+
subdir_responses = await self._aupload_directory_hierarchically(
|
1397
|
+
name=item.name,
|
1398
|
+
dir_path=item,
|
1399
|
+
description=None,
|
1400
|
+
manifest_filename=None,
|
1401
|
+
parent_id=subdir_response.data_storage.id,
|
1402
|
+
ignore_patterns=all_ignore_patterns,
|
1403
|
+
ignore_filename=ignore_filename,
|
1404
|
+
base_dir=base_dir,
|
1405
|
+
dir_manifest=subdir_manifest,
|
1406
|
+
dataset_id=current_dataset_id,
|
1407
|
+
)
|
1408
|
+
responses.extend(subdir_responses)
|
1409
|
+
elif item.is_file():
|
1410
|
+
file_response = await self._aprocess_file_item(
|
1411
|
+
item,
|
1412
|
+
dir_manifest or DirectoryManifest(),
|
1413
|
+
current_parent_id,
|
1414
|
+
current_dataset_id,
|
1415
|
+
)
|
1416
|
+
if file_response:
|
1417
|
+
responses.append(file_response)
|
1418
|
+
|
1419
|
+
return responses
|
1420
|
+
|
1421
|
+
@property
|
1422
|
+
def client(self) -> Client:
|
1423
|
+
raise NotImplementedError("client property must be implemented by subclass")
|
1424
|
+
|
1425
|
+
@property
|
1426
|
+
def async_client(self) -> AsyncClient:
|
1427
|
+
raise NotImplementedError(
|
1428
|
+
"async_client property must be implemented by subclass"
|
1429
|
+
)
|
1430
|
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def store_text_content(
+        self,
+        name: str,
+        content: str,
+        description: str | None = None,
+        path: str | None = None,
+    ) -> DataStorageResponse:
+        """Store content as a string in the data storage system.
+
+        Args:
+            name: Name of the data storage entry
+            content: Content to store as a string
+            description: Optional description of the data storage entry
+            path: Optional path for the data storage entry
+
+        Returns:
+            DataStorageResponse containing the created data storage entry and storage locations
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the data storage entry
+        """
+        try:
+            payload = DataStorageRequestPayload(
+                name=name,
+                content=content,
+                description=description,
+                path=path,
+            )
+            return self._create_data_storage_entry(payload)
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def astore_text_content(
+        self,
+        name: str,
+        content: str,
+        description: str | None = None,
+        path: str | None = None,
+    ) -> DataStorageResponse:
+        """Asynchronously store content as a string in the data storage system.
+
+        Args:
+            name: Name of the data storage entry
+            content: Content to store as a string
+            description: Optional description of the data storage entry
+            path: Optional path for the data storage entry
+
+        Returns:
+            DataStorageResponse containing the created data storage entry and storage locations
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the data storage entry
+        """
+        try:
+            payload = DataStorageRequestPayload(
+                name=name,
+                content=content,
+                description=description,
+                path=path,
+            )
+            return await self._acreate_data_storage_entry(payload)
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
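A short usage sketch of the two text-content helpers above. The top-level client name and its constructor are assumptions; the method signatures come from the diff:

    # Hypothetical usage; `FutureHouseClient` and its constructor are assumptions.
    from futurehouse_client import FutureHouseClient

    client = FutureHouseClient(api_key="...")  # assumed construction

    # Synchronous call: stores the string and returns a DataStorageResponse.
    resp = client.store_text_content(
        name="experiment-notes",
        content="aggregation buffer: 25 mM Tris, pH 7.4",
        description="scratch notes",
        path="notes/buffer.txt",
    )
    print(resp.data_storage.id)

    # Async variant (inside a coroutine): await client.astore_text_content(...)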
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def store_file_content(
+        self,
+        name: str,
+        file_path: str | Path,
+        description: str | None = None,
+        path: str | None = None,
+        as_collection: bool = False,
+        manifest_filename: str | None = None,
+        ignore_patterns: list[str] | None = None,
+        ignore_filename: str = ".gitignore",
+    ) -> DataStorageResponse:
+        """Store file or directory content in the data storage system.
+
+        For files: Small text files (< 10MB, supported formats) are sent as text content,
+        larger/binary files are uploaded via signed URL.
+
+        For directories: Zipped as a single file with ignore pattern support and uploaded
+        as a collection.
+
+        Args:
+            name: Name of the data storage entry
+            file_path: Path to file or directory to upload
+            description: Optional description of the data storage entry
+            path: Optional path for the data storage entry
+            as_collection: If true, upload directories as a single zip file collection.
+            manifest_filename: Name of manifest file
+            ignore_patterns: List of patterns to ignore when zipping directories
+            ignore_filename: Name of ignore file to read from directory (default: .gitignore)
+
+        Returns:
+            DataStorageResponse containing the final data storage entry
+
+        Raises:
+            DataStorageCreationError: If there's an error in the process
+        """
+        file_path = self._validate_file_path(file_path)
+
+        try:
+            if file_path.is_dir() and as_collection:
+                return self._upload_data_directory(
+                    name, file_path, description, path, ignore_patterns, ignore_filename
+                )
+            if file_path.is_dir() and not as_collection:
+                responses = self._upload_directory_hierarchically(
+                    name=name,
+                    dir_path=file_path,
+                    description=description,
+                    manifest_filename=manifest_filename,
+                    ignore_patterns=ignore_patterns,
+                    ignore_filename=ignore_filename,
+                )
+                if not responses:
+                    raise DataStorageCreationError(
+                        "No data storage entries were created"
+                    )
+                return responses[0]
+            return self._upload_data_single_file(name, file_path, description, path)
+
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during file upload: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def astore_file_content(
+        self,
+        name: str,
+        file_path: str | Path,
+        description: str | None = None,
+        path: str | None = None,
+        as_collection: bool = False,
+        manifest_filename: str | None = None,
+        ignore_patterns: list[str] | None = None,
+        ignore_filename: str = ".gitignore",
+    ) -> DataStorageResponse:
+        """Asynchronously store file or directory content in the data storage system.
+
+        Args:
+            name: Name of the data storage entry.
+            file_path: Path to the file or directory to upload.
+            description: Optional description for the entry.
+            path: Optional GCS path for the entry.
+            as_collection: If uploading a directory, `True` zips it into a single collection,
+                `False` uploads it as a hierarchical structure of individual objects.
+            manifest_filename: Optional manifest file for hierarchical uploads.
+            ignore_patterns: List of patterns to ignore when zipping.
+            ignore_filename: Name of ignore file to read (default: .gitignore).
+
+        Returns:
+            The `DataStorageResponse` for the created entry. For hierarchical uploads,
+            this is the response for the root directory entry.
+        """
+        file_path = self._validate_file_path(file_path)
+
+        try:
+            if file_path.is_dir():
+                if as_collection:
+                    return await self._aupload_data_directory(
+                        name,
+                        file_path,
+                        description,
+                        path,
+                        ignore_patterns,
+                        ignore_filename,
+                    )
+                responses = await self._aupload_directory_hierarchically(
+                    name=name,
+                    dir_path=file_path,
+                    description=description,
+                    manifest_filename=manifest_filename,
+                    ignore_patterns=ignore_patterns,
+                    ignore_filename=ignore_filename,
+                )
+                if not responses:
+                    raise DataStorageCreationError(
+                        "No data storage entries were created"
+                    )
+                return responses[0]
+            return await self._aupload_data_single_file(
+                name, file_path, description, path
+            )
+
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred during async file upload: {e!r}"
+            ) from e
+
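The file/directory helpers above cover three cases: a single file, a directory zipped into one collection, or a directory uploaded hierarchically as individual entries. A hedged sketch of the three call shapes, reusing the assumed `client` object from the earlier sketch:

    # Hypothetical usage; `client` is the assumed FutureHouseClient from above.
    from pathlib import Path

    # Single file: small supported text files go up as raw content,
    # larger or binary files are uploaded via a signed URL.
    client.store_file_content(name="report", file_path=Path("report.pdf"))

    # Directory zipped into one collection, honoring .gitignore plus extra patterns.
    client.store_file_content(
        name="project-archive",
        file_path=Path("./project"),
        as_collection=True,
        ignore_patterns=["*.pyc", "__pycache__/"],
    )

    # Directory uploaded hierarchically; the returned entry is the root directory.
    root = client.store_file_content(
        name="project-tree",
        file_path=Path("./project"),
        as_collection=False,
    )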
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def register_existing_data_source(
+        self,
+        name: str,
+        existing_location: DataStorageLocationPayload,
+        description: str | None = None,
+        path: str | None = None,
+    ) -> DataStorageResponse:
+        """Register an existing data source location in the data storage system.
+
+        Args:
+            name: Name of the data storage entry
+            existing_location: Describes the existing data source location to register
+            description: Optional description of the data storage entry
+            path: Optional path for the data storage entry
+
+        Returns:
+            DataStorageResponse containing the created data storage entry and storage locations
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the data storage entry
+        """
+        try:
+            payload = DataStorageRequestPayload(
+                name=name,
+                description=description,
+                path=path,
+                existing_location=existing_location,
+            )
+            response = self.client.post(
+                "/v0.1/data-storage", json=payload.model_dump(exclude_none=True)
+            )
+            response.raise_for_status()
+            return DataStorageResponse.model_validate(response.json())
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def aregister_existing_data_source(
+        self,
+        name: str,
+        existing_location: DataStorageLocationPayload,
+        description: str | None = None,
+        path: str | None = None,
+    ) -> DataStorageResponse:
+        """Asynchronously register an existing data source location in the data storage system.
+
+        Args:
+            name: Name of the data storage entry
+            existing_location: Describes the existing data source location to register
+            description: Optional description of the data storage entry
+            path: Optional path for the data storage entry
+
+        Returns:
+            DataStorageResponse containing the created data storage entry and storage locations
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the data storage entry
+        """
+        try:
+            payload = DataStorageRequestPayload(
+                name=name,
+                description=description,
+                path=path,
+                existing_location=existing_location,
+            )
+            response = await self.async_client.post(
+                "/v0.1/data-storage", json=payload.model_dump(exclude_none=True)
+            )
+            response.raise_for_status()
+            return DataStorageResponse.model_validate(response.json())
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
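`register_existing_data_source` and its async twin create an entry that points at data already living elsewhere rather than uploading bytes: they POST the payload, with `existing_location` set, to `/v0.1/data-storage`. A sketch of the call shape only; the fields of `DataStorageLocationPayload` are not shown in this section, so its construction is left as a placeholder:

    # Hypothetical sketch; DataStorageLocationPayload's fields are not shown in this diff.
    from futurehouse_client.models.data_storage_methods import DataStorageLocationPayload

    location = DataStorageLocationPayload(...)  # describe the pre-existing location here (fields assumed)
    entry = client.register_existing_data_source(
        name="legacy-dataset",
        existing_location=location,
        description="pre-existing data registered without re-uploading",
    )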
+    # TODO: EVERYTHING BELOW THIS LINE SHOULD BE MOVED TO FH_TOOLS REPO
+    # =================================================
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def fetch_data_from_storage(
+        self,
+        data_storage_id: UUID | None = None,
+    ) -> str | Path | None:
+        """Fetch data from the storage system (sync version).
+
+        Args:
+            data_storage_id: ID of the data storage entry to fetch
+
+        Returns:
+            For PG_TABLE storage: string content
+            For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
+            None if not found or error occurred
+        """
+        if not data_storage_id:
+            raise DataStorageCreationError(
+                "data_storage_id must be provided at this time"
+            )
+
+        try:
+            response = self.client.get(f"/v0.1/data-storage/{data_storage_id}")
+            response.raise_for_status()
+            result = DataStorageResponse.model_validate(response.json())
+
+            storage_type = result.storage_location.storage_config.storage_type
+
+            if storage_type == "gcs":
+                if not result.signed_url:
+                    raise DataStorageCreationError(
+                        "No signed URL available for GCS download"
+                    )
+
+                return self._download_from_gcs(result.signed_url)
+
+            if storage_type == "raw_content":
+                content = result.data_storage.content
+                if content is None:
+                    logger.warning(
+                        f"No content found for data storage entry {data_storage_id}"
+                    )
+                    return None
+                return content
+
+            raise DataStorageCreationError(f"Unsupported storage type: {storage_type}")
+
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
+
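Because `fetch_data_from_storage` returns a different type depending on where the entry's bytes live, callers typically branch on the result. A hedged sketch, with a placeholder UUID and the assumed `client` from earlier:

    # Hypothetical usage; the UUID value is a placeholder.
    from pathlib import Path
    from uuid import UUID

    data = client.fetch_data_from_storage(UUID("00000000-0000-0000-0000-000000000000"))
    if isinstance(data, Path):
        print(f"downloaded GCS object to {data}")  # may be unzipped if it was a zip
    elif isinstance(data, str):
        print(data)                                # raw content stored inline
    else:
        print("entry had no stored content")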
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, max=10),
+        retry=retry_if_connection_error,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    async def afetch_data_from_storage(
+        self,
+        data_storage_id: UUID | None = None,
+    ) -> str | Path | None:
+        """Fetch data from the storage system.
+
+        Args:
+            data_storage_id: ID of the data storage entry to fetch
+
+        Returns:
+            For PG_TABLE storage: string content
+            For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
+            None if not found or error occurred
+        """
+        if not data_storage_id:
+            raise DataStorageCreationError(
+                "data_storage_id must be provided at this time"
+            )
+
+        try:
+            response = await self.async_client.get(
+                f"/v0.1/data-storage/{data_storage_id}"
+            )
+            response.raise_for_status()
+            result = DataStorageResponse.model_validate(response.json())
+
+            storage_type = result.storage_location.storage_config.storage_type
+
+            if storage_type == "gcs":
+                if not result.signed_url:
+                    raise DataStorageCreationError(
+                        "No signed URL available for GCS download"
+                    )
+
+                return await self._adownload_from_gcs(result.signed_url)
+
+            if storage_type == "raw_content":
+                content = result.data_storage.content
+                if content is None:
+                    logger.warning(
+                        f"No content found for data storage entry {data_storage_id}"
+                    )
+                    return None
+                return content
+
+            raise DataStorageCreationError(f"Unsupported storage type: {storage_type}")
+
+        except HTTPStatusError as e:
+            self._handle_http_errors(e)
+        except Exception as e:
+            raise DataStorageCreationError(
+                f"An unexpected error occurred: {e!r}"
+            ) from e
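The async variants compose naturally inside a single event loop; a minimal end-to-end sketch, again assuming the `client` object constructed in the earlier examples:

    # Hypothetical async sketch; `client` is the assumed FutureHouseClient from above.
    import asyncio

    async def main() -> None:
        resp = await client.astore_text_content(name="notes", content="hello world")
        data = await client.afetch_data_from_storage(resp.data_storage.id)
        print(data)

    asyncio.run(main())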