tilebox-storage 0.41.0__tar.gz → 0.42.0__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: tilebox-storage
- Version: 0.41.0
+ Version: 0.42.0
  Summary: Storage client for Tilebox
  Project-URL: Homepage, https://tilebox.com
  Project-URL: Documentation, https://docs.tilebox.com/
@@ -20,10 +20,9 @@ Classifier: Topic :: Scientific/Engineering
  Classifier: Topic :: Software Development
  Requires-Python: >=3.10
  Requires-Dist: aiofile>=3.8
- Requires-Dist: boto3-stubs[essential]>=1.33
- Requires-Dist: boto3>=1.33
  Requires-Dist: folium>=0.15
  Requires-Dist: httpx>=0.27
+ Requires-Dist: obstore>=0.8.0
  Requires-Dist: shapely>=2
  Requires-Dist: tilebox-datasets
  Description-Content-Type: text/markdown
@@ -26,8 +26,7 @@ dependencies = [
  "aiofile>=3.8",
  "folium>=0.15",
  "shapely>=2",
- "boto3>=1.33",
- "boto3-stubs[essential]>=1.33",
+ "obstore>=0.8.0",
  ]

  [dependency-groups]
@@ -37,7 +36,6 @@ dev = [
  "pytest-asyncio>=0.24.0",
  "pytest-cov>=5.0.0",
  "pytest>=8.3.2",
- "moto>=5",
  ]

  [project.urls]
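The hunks above swap boto3 and its type stubs (plus moto in the dev group) for obstore as the S3 backend. A minimal before/after sketch of anonymous S3 access under that change, reusing the bucket and region that appear in the Umbra client further down:

    # 0.41.0: anonymous S3 access via boto3
    import boto3
    from botocore import UNSIGNED
    from botocore.client import Config

    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    # 0.42.0: the same bucket wrapped as an obstore S3Store
    from obstore.store import S3Store

    store = S3Store("umbra-open-data-catalog", region="us-west-2", skip_signature=True)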
@@ -3,6 +3,7 @@ from pathlib import Path
  from tilebox.storage.aio import ASFStorageClient as _ASFStorageClient
  from tilebox.storage.aio import CopernicusStorageClient as _CopernicusStorageClient
  from tilebox.storage.aio import UmbraStorageClient as _UmbraStorageClient
+ from tilebox.storage.aio import USGSLandsatStorageClient as _USGSLandsatStorageClient


  class ASFStorageClient(_ASFStorageClient):
@@ -50,3 +51,18 @@ class CopernicusStorageClient(_CopernicusStorageClient):
  """
  super().__init__(access_key, secret_access_key, cache_directory)
  self._syncify()
+
+
+ class USGSLandsatStorageClient(_USGSLandsatStorageClient):
+ def __init__(self, cache_directory: Path | None = Path.home() / ".cache" / "tilebox") -> None:
+ """A tilebox storage client that downloads data from the USGS Landsat S3 bucket.
+
+ This client handles the requester-pays nature of the bucket and provides methods for listing and downloading
+ data.
+
+ Args:
+ cache_directory: The directory to store downloaded data in. Defaults to ~/.cache/tilebox. If set to None
+ no cache is used and the `output_dir` parameter will need be set when downloading data.
+ """
+ super().__init__(cache_directory)
+ self._syncify()
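The new synchronous wrapper follows the existing ASF/Copernicus/Umbra pattern: it subclasses the async client and calls `_syncify()`. A short usage sketch; the top-level import path and the `datapoint` variable are assumptions here (a datapoint would typically come from a tilebox-datasets query):

    from tilebox.storage import USGSLandsatStorageClient  # assumed export path

    client = USGSLandsatStorageClient()
    available = client.list_objects(datapoint)   # synchronous thanks to _syncify()
    scene_dir = client.download(datapoint)       # downloads into ~/.cache/tilebox by default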
@@ -1,26 +1,31 @@
+ import asyncio
  import hashlib
  import os
  import shutil
  import tempfile
- import warnings
  import zipfile
- from collections.abc import AsyncIterator, Callable, Iterator
+ from asyncio import Queue, QueueEmpty
+ from collections.abc import AsyncIterator
  from pathlib import Path
- from typing import IO, Any
+ from typing import Any, TypeAlias

  import anyio
- import boto3
+ import obstore as obs
+ import xarray as xr
  from aiofile import async_open
- from botocore import UNSIGNED
- from botocore.client import Config
  from httpx import AsyncClient
- from mypy_boto3_s3 import S3Client
- from mypy_boto3_s3.type_defs import ObjectTypeDef
+ from obstore.auth.boto3 import Boto3CredentialProvider
+ from obstore.store import GCSStore, LocalStore, S3Store
  from tqdm.auto import tqdm

  from _tilebox.grpc.aio.producer_consumer import async_producer_consumer
  from _tilebox.grpc.aio.syncify import Syncifiable
- from tilebox.storage.granule import ASFStorageGranule, CopernicusStorageGranule, UmbraStorageGranule
+ from tilebox.storage.granule import (
+ ASFStorageGranule,
+ CopernicusStorageGranule,
+ UmbraStorageGranule,
+ USGSLandsatStorageGranule,
+ )
  from tilebox.storage.providers import login

  try:
@@ -38,7 +43,7 @@ except ImportError:
  raise RuntimeError("IPython is not available. Diagram can only be displayed in a notebook.")


- import xarray as xr
+ ObjectStore: TypeAlias = S3Store | LocalStore | GCSStore


  class _HttpClient(Syncifiable):
@@ -47,6 +52,10 @@ class _HttpClient(Syncifiable):
  self._clients: dict[str, AsyncClient] = {}
  self._auth = auth

+ def __del__(self) -> None:
+ for client in self._clients.values():
+ asyncio.run(client.aclose())
+
  async def download_quicklook(
  self, datapoint: xr.Dataset | ASFStorageGranule, output_dir: Path | None = None
  ) -> Path:
@@ -94,7 +103,7 @@ class _HttpClient(Syncifiable):
  image_data = await self._download_quicklook(granule)
  assert granule.urls.quicklook is not None # otherwise _download_quicklook would have raised a ValueError
  image_name = granule.urls.quicklook.rsplit("/", 1)[-1]
- _display_quicklook(image_data, image_name, granule.time.year, width, height)
+ _display_quicklook(image_data, width, height, f"<code>Image {image_name} © ASF {granule.time.year}</code>")

  async def _download_quicklook(self, granule: ASFStorageGranule) -> bytes:
  """Download a granules quicklook image into a memory buffer."""
@@ -224,10 +233,10 @@ class _HttpClient(Syncifiable):
  return client


- def _display_quicklook(image_data: bytes | Path, image_name: str, year: int, width: int, height: int) -> None:
+ def _display_quicklook(image_data: bytes | Path, width: int, height: int, image_caption: str | None = None) -> None:
  display(Image(image_data, width=width, height=height))
- image_copyright = HTML(f"<code>Image {image_name} © ESA {year}</code>")
- display(image_copyright)
+ if image_caption is not None:
+ display(HTML(image_caption))


  class StorageClient(Syncifiable):
@@ -248,6 +257,71 @@ class StorageClient(Syncifiable):
  shutil.rmtree(self._cache)


+ async def list_object_paths(store: ObjectStore, prefix: str) -> list[str]:
+ objects = await obs.list(store, prefix).collect_async()
+ prefix_path = Path(prefix)
+ return sorted(str(Path(obj["path"]).relative_to(prefix_path)) for obj in objects)
+
+
+ async def download_objects( # noqa: PLR0913
+ store: ObjectStore,
+ prefix: str,
+ objects: list[str],
+ output_dir: Path,
+ show_progress: bool = True,
+ max_concurrent_downloads: int = 10,
+ ) -> None:
+ queue = Queue()
+ for obj in objects:
+ await queue.put((prefix, obj))
+
+ max_concurrent_downloads = max(1, min(max_concurrent_downloads, len(objects)))
+ async with anyio.create_task_group() as task_group:
+ for _ in range(max_concurrent_downloads):
+ task_group.start_soon(_download_worker, store, queue, output_dir, show_progress)
+
+
+ async def _download_worker(
+ store: ObjectStore,
+ queue: Queue[tuple[str, str]],
+ output_dir: Path,
+ show_progress: bool = True,
+ ) -> None:
+ while True:
+ try:
+ prefix, obj = queue.get_nowait()
+ except QueueEmpty:
+ break
+
+ await _download_object(store, prefix, obj, output_dir, show_progress)
+
+
+ async def _download_object(
+ store: ObjectStore, prefix: str, obj: str, output_dir: Path, show_progress: bool = True
+ ) -> Path:
+ key = str(Path(prefix) / obj)
+ output_path = output_dir / obj
+ if output_path.exists(): # already cached
+ return output_path
+
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ download_path = output_path.parent / f"{output_path.name}.part"
+ response = await obs.get_async(store, key)
+ file_size = response.meta["size"]
+ with download_path.open("wb") as f:
+ if show_progress:
+ with tqdm(desc=obj, total=file_size, unit="B", unit_scale=True, unit_divisor=1024) as progress:
+ async for bytes_chunk in response:
+ f.write(bytes_chunk)
+ progress.update(len(bytes_chunk))
+ else:
+ async for bytes_chunk in response:
+ f.write(bytes_chunk)
+
+ shutil.move(download_path, output_path)
+ return output_path
+
+
  class ASFStorageClient(StorageClient):
  def __init__(self, user: str, password: str, cache_directory: Path = Path.home() / ".cache" / "tilebox") -> None:
  """A tilebox storage client that downloads data from the Alaska Satellite Facility.
@@ -342,7 +416,7 @@ class ASFStorageClient(StorageClient):
  if Image is None:
  raise ImportError("IPython is not available, please use download_preview instead.")
  quicklook = await self._download_quicklook(datapoint)
- _display_quicklook(quicklook, quicklook.name, granule.time.year, width, height)
+ _display_quicklook(quicklook, width, height, f"<code>Image {quicklook.name} © ASF {granule.time.year}</code>")

  async def _download_quicklook(self, datapoint: xr.Dataset | ASFStorageGranule) -> Path:
  granule = ASFStorageGranule.from_data(datapoint)
@@ -359,49 +433,15 @@ class ASFStorageClient(StorageClient):
  return await self._client.download_quicklook(datapoint, output_file.parent)


- class _S3Client:
- def __init__(self, s3: S3Client, bucket: str) -> None:
- self._bucket = bucket
- self._s3 = s3
-
- def list_objects(self, prefix: str) -> Iterator[ObjectTypeDef]:
- """Returns an iterator over the objects in the S3 bucket that starts with the given prefix."""
- paginator = self._s3.get_paginator("list_objects_v2")
- for page in paginator.paginate(Bucket=self._bucket, Prefix=prefix):
- yield from page.get("Contents", [])
-
- async def download_object( # noqa: PLR0913
- self, key: str, name: str, size: int, download_file: IO[Any], verify: bool, show_progress: bool
- ) -> None:
- """Download an object from S3 into a file."""
- progress = None
- if show_progress:
- progress = tqdm(
- desc=name,
- total=size,
- unit="B",
- unit_scale=True,
- unit_divisor=1024,
- )
-
- self._s3.download_fileobj(
- Bucket=self._bucket,
- Key=key,
- Fileobj=download_file,
- ExtraArgs={"ChecksumMode": "ENABLED"} if verify else None,
- Callback=progress.update if progress else None,
- )
-
- if progress is not None:
- if progress.total != progress.n:
- progress.n = progress.total
- progress.refresh()
- progress.close()
+ def _umbra_s3_prefix(datapoint: xr.Dataset | UmbraStorageGranule) -> str:
+ granule = UmbraStorageGranule.from_data(datapoint)
+ return f"sar-data/tasks/{granule.location}/"


  class UmbraStorageClient(StorageClient):
  _STORAGE_PROVIDER = "Umbra"
  _BUCKET = "umbra-open-data-catalog"
+ _REGION = "us-west-2"

  def __init__(self, cache_directory: Path | None = Path.home() / ".cache" / "tilebox") -> None:
  """A tilebox storage client that downloads data from the Umbra Open Data Catalog.
@@ -412,14 +452,9 @@ class UmbraStorageClient(StorageClient):
  """
  super().__init__(cache_directory)

- with warnings.catch_warnings():
- # https://github.com/boto/boto3/issues/3889
- warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*datetime.utcnow.*")
- boto3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
+ self._store: ObjectStore = S3Store(self._BUCKET, region=self._REGION, skip_signature=True)

- self._s3 = _S3Client(s3=boto3_client, bucket=self._BUCKET)
-
- def list_objects(self, datapoint: xr.Dataset | UmbraStorageGranule) -> list[str]:
+ async def list_objects(self, datapoint: xr.Dataset | UmbraStorageGranule) -> list[str]:
  """List all available objects for a given datapoint.

  Args:
@@ -427,38 +462,36 @@ class UmbraStorageClient(StorageClient):

  Returns:
  List of object keys available for the given datapoint, relative to the granule location."""
- granule = UmbraStorageGranule.from_data(datapoint)
- prefix = f"sar-data/tasks/{granule.location}/"
- keys = [object_metadata.get("Key") for object_metadata in self._s3.list_objects(prefix)]
- return [k.removeprefix(prefix) for k in keys if k is not None]
+ return await list_object_paths(self._store, _umbra_s3_prefix(datapoint))

  async def download(
  self,
  datapoint: xr.Dataset | UmbraStorageGranule,
  output_dir: Path | None = None,
- verify: bool = True,
  show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
  ) -> Path:
  """Download the data for a given datapoint.

  Args:
  datapoint: The datapoint to download the data for.
  output_dir: The directory to download the data to. Optional, defaults to the cache directory.
- verify: Whether to verify the md5sum of the downloaded file. Defaults to True.
  show_progress: Whether to show a progress bar while downloading. Defaults to True.
+ max_concurrent_downloads: The maximum number of concurrent downloads. Defaults to 4.

  Returns:
  The path to the downloaded data directory.
  """
- return await self._download(datapoint, None, output_dir, verify, show_progress)
+ all_objects = await list_object_paths(self._store, _umbra_s3_prefix(datapoint))
+ return await self._download_objects(datapoint, all_objects, output_dir, show_progress, max_concurrent_downloads)

  async def download_objects(
  self,
  datapoint: xr.Dataset | UmbraStorageGranule,
  objects: list[str],
  output_dir: Path | None = None,
- verify: bool = True,
  show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
  ) -> Path:
  """Download a subset of the data for a given datapoint.

@@ -470,80 +503,39 @@ class UmbraStorageClient(StorageClient):
  list_objects to get a list of available objects to filter on. Object names are considered relative
  to the granule location.
  output_dir: The directory to download the data to. Optional, defaults to the cache directory.
- verify: Whether to verify the md5sum of the downloaded file. Defaults to True.
  show_progress: Whether to show a progress bar while downloading. Defaults to True.
+ max_concurrent_downloads: The maximum number of concurrent downloads. Defaults to 4.

  Returns:
  The path to the downloaded data directory.
  """
- return await self._download(datapoint, lambda key: key in objects, output_dir, verify, show_progress)
+ return await self._download_objects(datapoint, objects, output_dir, show_progress, max_concurrent_downloads)

- async def _download(
+ async def _download_objects(
  self,
  datapoint: xr.Dataset | UmbraStorageGranule,
- obj_filter_func: Callable[[str], bool] | None = None,
+ objects: list[str],
  output_dir: Path | None = None,
- verify: bool = True,
  show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
  ) -> Path:
- granule = UmbraStorageGranule.from_data(datapoint)
+ prefix = _umbra_s3_prefix(datapoint)

  base_folder = output_dir or self._cache
  if base_folder is None:
  raise ValueError("No cache directory or output directory provided.")
- output_folder = base_folder / self._STORAGE_PROVIDER / granule.location
-
- prefix = f"sar-data/tasks/{granule.location}/"
-
- objects = self._s3.list_objects(prefix)
- objects = [obj for obj in objects if "Key" in obj] # Key is optional, so just in case filter out obj without
-
- if obj_filter_func is not None:
- # get object names relative to the granule location, so we can pass it to our filter function
- object_names = [obj["Key"].removeprefix(prefix) for obj in objects if "Key" in obj]
- objects = [
- object_metadata
- for (object_metadata, object_name) in zip(objects, object_names, strict=True)
- if obj_filter_func(object_name)
- ]
+ output_folder = base_folder / self._STORAGE_PROVIDER / Path(prefix)

- async with anyio.create_task_group() as task_group:
- for object_metadata in objects:
- task_group.start_soon(
- self._download_object, object_metadata, prefix, output_folder, verify, show_progress
- )
+ if len(objects) == 0:
+ return output_folder

+ await download_objects(self._store, prefix, objects, output_folder, show_progress, max_concurrent_downloads)
  return output_folder

- async def _download_object(
- self, object_metadata: ObjectTypeDef, prefix: str, output_folder: Path, verify: bool, show_progress: bool
- ) -> None:
- key = object_metadata.get("Key", "")
- relative_path = key.removeprefix(prefix)
- if relative_path.removeprefix("/") == "": # skip the root folder if it shows up in the list for some reason
- return
- if object_metadata.get("Size", 0) == 0: # skip empty objects (they are just folder markers)
- return

- output_file = output_folder / relative_path
- if output_file.exists():
- return
-
- # we download into a temporary file, which we then move to the final location once the download is complete
- # this way we can be sure that the files in the download location are complete and not partially downloaded
- with tempfile.NamedTemporaryFile(prefix="tilebox", delete=False) as download_file:
- await self._s3.download_object(
- key,
- # as "name" for the progress bar we display the relative path to the root of the download
- relative_path,
- object_metadata.get("Size", 0),
- download_file,
- verify,
- show_progress,
- )
-
- output_folder.mkdir(parents=True, exist_ok=True)
- shutil.move(download_file.name, output_file)
+ def _copernicus_s3_prefix(datapoint: xr.Dataset | CopernicusStorageGranule) -> str:
+ granule = CopernicusStorageGranule.from_data(datapoint)
+ return granule.location.removeprefix("/eodata/")


  class CopernicusStorageClient(StorageClient):
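With the Umbra changes above, `list_objects` becomes a coroutine, the `verify` flag is dropped, and concurrency is controlled via `max_concurrent_downloads`. A sketch of the updated call pattern on the async client; the `datapoint` here is assumed to be an xr.Dataset or UmbraStorageGranule obtained from a datasets query:

    from tilebox.storage.aio import UmbraStorageClient

    async def fetch_umbra(datapoint) -> None:
        client = UmbraStorageClient()
        objects = await client.list_objects(datapoint)          # async in 0.42.0
        tiffs = [name for name in objects if name.endswith(".tif")]
        await client.download_objects(datapoint, tiffs, max_concurrent_downloads=4)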
@@ -588,22 +580,14 @@ class CopernicusStorageClient(StorageClient):
  f"To get access to the Copernicus data, please visit: https://documentation.dataspace.copernicus.eu/APIs/S3.html"
  )

- with warnings.catch_warnings():
- # https://github.com/boto/boto3/issues/3889
- warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*datetime.utcnow.*")
- boto3_client = boto3.client(
- "s3",
- aws_access_key_id=access_key,
- aws_secret_access_key=secret_access_key,
- endpoint_url=self._ENDPOINT_URL,
- )
-
- self._s3 = _S3Client(
- s3=boto3_client,
+ self._store = S3Store(
  bucket=self._BUCKET,
+ endpoint=self._ENDPOINT_URL,
+ access_key_id=access_key,
+ secret_access_key=secret_access_key,
  )

- def list_objects(self, datapoint: xr.Dataset | CopernicusStorageGranule) -> list[str]:
+ async def list_objects(self, datapoint: xr.Dataset | CopernicusStorageGranule) -> list[str]:
  """List all available objects for a given datapoint.

  Args:
@@ -611,38 +595,54 @@ class CopernicusStorageClient(StorageClient):

  Returns:
  List of object keys available for the given datapoint, relative to the granule location."""
+ return await self._list_objects(datapoint)
+
+ async def _list_objects(self, datapoint: xr.Dataset | CopernicusStorageGranule) -> list[str]:
+ """List all available objects for a given datapoint.
+
+ Args:
+ datapoint: The datapoint to list available objects the data for.
+
+ Returns:
+ List of object keys available for the given datapoint, relative to the granule location."""
+
  granule = CopernicusStorageGranule.from_data(datapoint)
- prefix = granule.location.removeprefix("/eodata/") + "/"
- keys = [object_metadata.get("Key") for object_metadata in self._s3.list_objects(prefix)]
- return [k.removeprefix(prefix) for k in keys if k is not None]
+ # special handling for Sentinel-5P, where the location is not a folder but a single file
+ if granule.location.endswith(".nc"):
+ return [Path(granule.granule_name).name]
+
+ return await list_object_paths(self._store, _copernicus_s3_prefix(granule))

  async def download(
  self,
  datapoint: xr.Dataset | CopernicusStorageGranule,
  output_dir: Path | None = None,
- verify: bool = True,
  show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
  ) -> Path:
  """Download the data for a given datapoint.

  Args:
  datapoint: The datapoint to download the data for.
  output_dir: The directory to download the data to. Optional, defaults to the cache directory.
- verify: Whether to verify the md5sum of the downloaded file. Defaults to True.
  show_progress: Whether to show a progress bar while downloading. Defaults to True.
+ max_concurrent_downloads: The maximum number of concurrent downloads. Defaults to 4.

  Returns:
  The path to the downloaded data directory.
  """
- return await self._download(datapoint, None, output_dir, verify, show_progress)
+ granule = CopernicusStorageGranule.from_data(datapoint)
+
+ all_objects = await self._list_objects(granule)
+ return await self._download_objects(granule, all_objects, output_dir, show_progress, max_concurrent_downloads)

  async def download_objects(
  self,
  datapoint: xr.Dataset | CopernicusStorageGranule,
  objects: list[str],
  output_dir: Path | None = None,
- verify: bool = True,
  show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
  ) -> Path:
  """Download a subset of the data for a given datapoint.

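The Copernicus client follows the same async pattern, plus special-casing for Sentinel-5P products whose location points at a single .nc file rather than a folder. A sketch, assuming CDSE S3 credentials and a datapoint obtained from a datasets query:

    from tilebox.storage.aio import CopernicusStorageClient

    async def fetch_copernicus(datapoint) -> None:
        client = CopernicusStorageClient(access_key="...", secret_access_key="...")  # CDSE S3 credentials
        objects = await client.list_objects(datapoint)          # async in 0.42.0
        product = await client.download(datapoint, max_concurrent_downloads=4)
        # for Sentinel-5P (.nc) datapoints `product` is the file itself, otherwise the product directory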
@@ -654,80 +654,249 @@ class CopernicusStorageClient(StorageClient):
  list_objects to get a list of available objects to filter on. Object names are considered relative
  to the granule location.
  output_dir: The directory to download the data to. Optional, defaults to the cache directory.
- verify: Whether to verify the md5sum of the downloaded file. Defaults to True.
  show_progress: Whether to show a progress bar while downloading. Defaults to True.
+ max_concurrent_downloads: The maximum number of concurrent downloads. Defaults to 4.

  Returns:
  The path to the downloaded data directory.
  """
- return await self._download(datapoint, lambda key: key in objects, output_dir, verify, show_progress)
+ return await self._download_objects(datapoint, objects, output_dir, show_progress, max_concurrent_downloads)

- async def _download(
+ async def _download_objects(
  self,
  datapoint: xr.Dataset | CopernicusStorageGranule,
- obj_filter_func: Callable[[str], bool] | None = None,
+ objects: list[str],
  output_dir: Path | None = None,
- verify: bool = True,
  show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
  ) -> Path:
  granule = CopernicusStorageGranule.from_data(datapoint)
+ prefix = _copernicus_s3_prefix(granule)
+ single_file = False
+
+ # special handling for Sentinel-5P, where the location is not a folder but a single file
+ if granule.location.endswith(".nc"):
+ single_file = True
+ prefix = str(Path(prefix).parent)

  base_folder = output_dir or self._cache
  if base_folder is None:
  raise ValueError("No cache directory or output directory provided.")
- output_folder = base_folder / Path(granule.location.removeprefix("/eodata/"))
-
- prefix = granule.location.removeprefix("/eodata/") + "/"
- objects = self._s3.list_objects(prefix)
- objects = [obj for obj in objects if "Key" in obj] # Key is optional, so just in case filter out obj without
-
- if obj_filter_func is not None:
- # get object names relative to the granule location, so we can pass it to our filter function
- object_names = [obj["Key"].removeprefix(prefix) for obj in objects if "Key" in obj]
- objects = [
- object_metadata
- for (object_metadata, object_name) in zip(objects, object_names, strict=True)
- if obj_filter_func(object_name)
- ]
-
- async with anyio.create_task_group() as task_group:
- # even though this is a async task group, the downloads are still synchronous
- # because the S3 client is synchronous
- # we could work around this by using anyio.to_thread.run_sync
- # but then we download all files in parallel, which might be too much
- for object_metadata in objects:
- task_group.start_soon(
- self._download_object, object_metadata, prefix, output_folder, verify, show_progress
- )
+ output_folder = base_folder / self._STORAGE_PROVIDER / Path(prefix)
+
+ if len(objects) == 0:
+ return output_folder

+ await download_objects(self._store, prefix, objects, output_folder, show_progress, max_concurrent_downloads)
+ if single_file:
+ return output_folder / objects[0]
  return output_folder

- async def _download_object(
- self, object_metadata: ObjectTypeDef, prefix: str, output_folder: Path, verify: bool, show_progress: bool
+ async def download_quicklook(self, datapoint: xr.Dataset | CopernicusStorageGranule) -> Path:
+ """Download the quicklook image for a given datapoint.
+
+ Args:
+ datapoint: The datapoint to download the quicklook for.
+
+ Raises:
+ ValueError: If no quicklook is available for the given datapoint.
+
+ Returns:
+ The path to the downloaded quicklook image.
+ """
+ return await self._download_quicklook(datapoint)
+
+ async def quicklook(
+ self, datapoint: xr.Dataset | CopernicusStorageGranule, width: int = 600, height: int = 600
  ) -> None:
- key = object_metadata.get("Key", "")
- relative_path = key.removeprefix(prefix)
- if relative_path.removeprefix("/") == "": # skip the root folder if it shows up in the list for some reason
- return
- if object_metadata.get("Size", 0) == 0: # skip empty objects (they are just folder markers)
- return
+ """Display the quicklook image for a given datapoint.

- output_file = output_folder / relative_path
- if output_file.exists():
- return
+ Requires an IPython kernel to be running. If you are not using IPython, use download_quicklook instead.

- # we download into a temporary file, which we then move to the final location once the download is complete
- # this way we can be sure that the files in the download location are complete and not partially downloaded
- with tempfile.NamedTemporaryFile(prefix="tilebox", delete=False) as download_file:
- await self._s3.download_object(
- key,
- # as "name" for the progress bar we display the relative path to the root of the download
- relative_path,
- object_metadata.get("Size", 0),
- download_file,
- verify,
- show_progress,
- )
+ Args:
+ datapoint: The datapoint to download the quicklook for.
+ width: Display width of the image in pixels. Defaults to 600.
+ height: Display height of the image in pixels. Defaults to 600.
+
+ Raises:
+ ImportError: In case IPython is not available.
+ ValueError: If no quicklook is available for the given datapoint.
+ """
+ if Image is None:
+ raise ImportError("IPython is not available, please use download_preview instead.")
+ granule = CopernicusStorageGranule.from_data(datapoint)
+ quicklook = await self._download_quicklook(granule)
+ _display_quicklook(quicklook, width, height, f"<code>{granule.granule_name} © ESA {granule.time.year}</code>")
+
+ async def _download_quicklook(self, datapoint: xr.Dataset | CopernicusStorageGranule) -> Path:
+ granule = CopernicusStorageGranule.from_data(datapoint)
+ if granule.thumbnail is None:
+ raise ValueError(f"No quicklook available for {granule.granule_name}")
+
+ prefix = _copernicus_s3_prefix(granule)
+ output_folder = (
+ self._cache / self._STORAGE_PROVIDER / Path(prefix)
+ if self._cache is not None
+ else Path.cwd() / self._STORAGE_PROVIDER
+ )
+
+ await download_objects(self._store, prefix, [granule.thumbnail], output_folder, show_progress=False)
+ return output_folder / granule.thumbnail
+
+
+ def _landsat_s3_prefix(datapoint: xr.Dataset | USGSLandsatStorageGranule) -> str:
+ granule = USGSLandsatStorageGranule.from_data(datapoint)
+ return granule.location.removeprefix("s3://usgs-landsat/")
+
+
+ class USGSLandsatStorageClient(StorageClient):
+ """
+ A client for downloading USGS Landsat data from the usgs-landsat and usgs-landsat-ard S3 bucket.
+
+ This client handles the requester-pays nature of the bucket and provides methods for listing and downloading data.
+ """
+
+ _STORAGE_PROVIDER = "USGSLandsat"
+ _BUCKET = "usgs-landsat"
+ _REGION = "us-west-2"
+
+ def __init__(self, cache_directory: Path | None = Path.home() / ".cache" / "tilebox") -> None:
+ """A tilebox storage client that downloads data from the USGS Landsat S3 bucket.
+
+ Args:
+ cache_directory: The directory to store downloaded data in. Defaults to ~/.cache/tilebox. If set to None
+ no cache is used and the `output_dir` parameter will need be set when downloading data.
+ """
+ super().__init__(cache_directory)
+
+ self._store = S3Store(
+ self._BUCKET, region=self._REGION, request_payer=True, credential_provider=Boto3CredentialProvider()
+ )
+
+ async def list_objects(self, datapoint: xr.Dataset | USGSLandsatStorageGranule) -> list[str]:
+ """List all available objects for a given datapoint.
+
+ Args:
+ datapoint: The datapoint to list available objects the data for.
+
+ Returns:
+ List of object keys available for the given datapoint, relative to the granule location."""
+ return await list_object_paths(self._store, _landsat_s3_prefix(datapoint))
+
+ async def download(
+ self,
+ datapoint: xr.Dataset | USGSLandsatStorageGranule,
+ output_dir: Path | None = None,
+ show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
+ ) -> Path:
+ """Download the data for a given datapoint.
+
+ Args:
+ datapoint: The datapoint to download the data for.
+ output_dir: The directory to download the data to. Optional, defaults to the cache directory.
+ show_progress: Whether to show a progress bar while downloading. Defaults to True.
+ max_concurrent_downloads: The maximum number of concurrent downloads. Defaults to 4.
+
+ Returns:
+ The path to the downloaded data directory.
+ """
+ all_objects = await list_object_paths(self._store, _landsat_s3_prefix(datapoint))
+ return await self._download_objects(datapoint, all_objects, output_dir, show_progress, max_concurrent_downloads)
+
+ async def download_objects(
+ self,
+ datapoint: xr.Dataset | USGSLandsatStorageGranule,
+ objects: list[str],
+ output_dir: Path | None = None,
+ show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
+ ) -> Path:
+ """Download a subset of the data for a given datapoint.
+
+ Typically used in conjunction with list_objects to filter the available objects beforehand.
+
+ Args:
+ datapoint: The datapoint to download the data for.
+ objects: A list of objects to download. Only objects that are in this list will be downloaded. See
+ list_objects to get a list of available objects to filter on. Object names are considered relative
+ to the granule location.
+ output_dir: The directory to download the data to. Optional, defaults to the cache directory.
+ show_progress: Whether to show a progress bar while downloading. Defaults to True.
+ max_concurrent_downloads: The maximum number of concurrent downloads. Defaults to 4.
+
+ Returns:
+ The path to the downloaded data directory.
+ """
+ return await self._download_objects(datapoint, objects, output_dir, show_progress, max_concurrent_downloads)
+
+ async def _download_objects(
+ self,
+ datapoint: xr.Dataset | USGSLandsatStorageGranule,
+ objects: list[str],
+ output_dir: Path | None = None,
+ show_progress: bool = True,
+ max_concurrent_downloads: int = 4,
+ ) -> Path:
+ prefix = _landsat_s3_prefix(datapoint)
+
+ base_folder = output_dir or self._cache
+ if base_folder is None:
+ raise ValueError("No cache directory or output directory provided.")
+ output_folder = base_folder / Path(prefix)
+
+ if len(objects) == 0:
+ return output_folder
+
+ await download_objects(self._store, prefix, objects, output_folder, show_progress, max_concurrent_downloads)
+ return output_folder
+
+ async def download_quicklook(self, datapoint: xr.Dataset | USGSLandsatStorageGranule) -> Path:
+ """Download the quicklook image for a given datapoint.
+
+ Args:
+ datapoint: The datapoint to download the quicklook for.
+
+ Raises:
+ ValueError: If no quicklook is available for the given datapoint.
+
+ Returns:
+ The path to the downloaded quicklook image.
+ """
+ return await self._download_quicklook(datapoint)
+
+ async def quicklook(
+ self, datapoint: xr.Dataset | USGSLandsatStorageGranule, width: int = 600, height: int = 600
+ ) -> None:
+ """Display the quicklook image for a given datapoint.
+
+ Requires an IPython kernel to be running. If you are not using IPython, use download_quicklook instead.
+
+ Args:
+ datapoint: The datapoint to download the quicklook for.
+ width: Display width of the image in pixels. Defaults to 600.
+ height: Display height of the image in pixels. Defaults to 600.
+
+ Raises:
+ ImportError: In case IPython is not available.
+ ValueError: If no quicklook is available for the given datapoint.
+ """
+ if Image is None:
+ raise ImportError("IPython is not available, please use download_preview instead.")
+ quicklook = await self._download_quicklook(datapoint)
+ _display_quicklook(quicklook, width, height, f"<code>Image {quicklook.name} © USGS</code>")
+
+ async def _download_quicklook(self, datapoint: xr.Dataset | USGSLandsatStorageGranule) -> Path:
+ granule = USGSLandsatStorageGranule.from_data(datapoint)
+ if granule.thumbnail is None:
+ raise ValueError(f"No quicklook available for {granule.granule_name}")
+
+ prefix = _landsat_s3_prefix(datapoint)
+ output_folder = (
+ self._cache / self._STORAGE_PROVIDER / Path(prefix)
+ if self._cache is not None
+ else Path.cwd() / self._STORAGE_PROVIDER
+ )

- output_file.parent.mkdir(parents=True, exist_ok=True)
- shutil.move(download_file.name, output_file)
+ await download_objects(self._store, prefix, [granule.thumbnail], output_folder, show_progress=False)
+ return output_folder / granule.thumbnail
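The new async Landsat client above wires the requester-pays usgs-landsat bucket to an S3Store with request_payer=True and a Boto3CredentialProvider, so AWS credentials resolvable by boto3 (environment variables, a profile, or an instance role) are required and request/transfer costs are billed to the caller. A usage sketch; the `datapoint` is assumed to come from a Landsat dataset query:

    import asyncio

    from tilebox.storage.aio import USGSLandsatStorageClient

    async def fetch_landsat(datapoint) -> None:
        client = USGSLandsatStorageClient()                     # picks up boto3-resolvable AWS credentials
        print(await client.list_objects(datapoint))
        thumbnail = await client.download_quicklook(datapoint)  # small preview, no progress bar
        scene_dir = await client.download(datapoint, max_concurrent_downloads=4)

    # asyncio.run(fetch_landsat(datapoint))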
@@ -1,5 +1,6 @@
  from dataclasses import dataclass
  from datetime import datetime
+ from pathlib import Path

  import xarray as xr

@@ -64,7 +65,6 @@ def _asf_download_urls(granule_name: str) -> StorageURLs:
  class UmbraStorageGranule:
  time: datetime
  granule_name: str
- processing_level: str
  location: str

  @classmethod
@@ -84,17 +84,34 @@ class UmbraStorageGranule:
  return cls(
  time,
  dataset.granule_name.item(),
- dataset.processing_level.item(),
  dataset.location.item(),
  )


+ def _thumbnail_relative_to_eodata_location(thumbnail_url: str, location: str) -> str:
+ """
+ Returns a thumbnail path from a URL as a path relative to a storage location.
+
+ For example:
+ >>> _thumbnail_relative_to_location(
+ >>> "https://catalogue.dataspace.copernicus.eu/get-object?path=/Sentinel-1/SAR/EW_GRDM_1S/2025/08/07/S1A_EW_GRDM_1SDH_20250807T111242_20250807T111346_060429_078305_DB6A.SAFE/preview/thumbnail.png",
+ >>> "/eodata/Sentinel-1/SAR/EW_GRDM_1S/2025/08/07/S1A_EW_GRDM_1SDH_20250807T111242_20250807T111346_060429_078305_DB6A.SAFE"
+ >>> )
+ "preview/thumbnail.png"
+ """
+
+ url_path = thumbnail_url.split("?path=")[-1]
+ url_path = url_path.removeprefix("/")
+ location = location.removeprefix("/eodata/")
+ return str(Path(url_path).relative_to(location))
+
+
  @dataclass
  class CopernicusStorageGranule:
  time: datetime
  granule_name: str
  location: str
- file_size: int
+ thumbnail: str | None = None

  @classmethod
  def from_data(cls, dataset: "xr.Dataset | CopernicusStorageGranule") -> "CopernicusStorageGranule":
@@ -110,9 +127,59 @@ class CopernicusStorageGranule:

  time = datetime.combine(dataset.time.dt.date.item(), dataset.time.dt.time.item())

+ location = dataset.location.item()
+
+ thumbnail_path = None
+ if "thumbnail" in dataset:
+ thumbnail_path = dataset.thumbnail.item().strip()
+
+ thumbnail = (
+ _thumbnail_relative_to_eodata_location(thumbnail_path, location)
+ if isinstance(thumbnail_path, str) and len(thumbnail_path) > 0
+ else None
+ )
+
  return cls(
  time,
  dataset.granule_name.item(),
- dataset.location.item(),
- dataset.file_size.item(),
+ location,
+ thumbnail,
+ )
+
+
+ @dataclass
+ class USGSLandsatStorageGranule:
+ time: datetime
+ granule_name: str
+ location: str
+ thumbnail: str | None = None
+
+ @classmethod
+ def from_data(cls, dataset: "xr.Dataset | USGSLandsatStorageGranule") -> "USGSLandsatStorageGranule":
+ """Extract the granule information from a datapoint given as xarray dataset."""
+ if isinstance(dataset, USGSLandsatStorageGranule):
+ return dataset
+
+ if "time" in dataset.dims:
+ if dataset.sizes["time"] == 1:
+ dataset = dataset.isel(time=0)
+ else:
+ raise ValueError("The given dataset has more than one granule.")
+
+ time = datetime.combine(dataset.time.dt.date.item(), dataset.time.dt.time.item())
+
+ thumbnail_path: str | None = None
+ if "thumbnail" in dataset:
+ thumbnail_path = dataset.thumbnail.item()
+ elif "overview" in dataset:
+ thumbnail_path = dataset.overview.item()
+
+ thumbnail = thumbnail_path.split("/")[-1] if isinstance(thumbnail_path, str) else None
+
+ return cls(
+ time,
+ dataset.granule_name.item(),
+ # Landsat 2 STAC items have an incorrect bucket name set, it should be usgs-landsat as well
+ dataset.location.item().replace("s3://usgs-landsat-ard/", "s3://usgs-landsat/"),
+ thumbnail,
  )
@@ -60,6 +60,8 @@ async def _asf_login(auth: tuple[str, str]) -> AsyncClient:
  "redirect_uri": "https://auth.asf.alaska.edu/login",
  },
  )
+ await response.aclose()
  if response.status_code == 401:
+ await client.aclose()
  raise ValueError("Invalid username or password.")
  return client