cubexpress 0.1.18.tar.gz → 0.1.21.tar.gz
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {cubexpress-0.1.18 → cubexpress-0.1.21}/PKG-INFO +5 -4
- cubexpress-0.1.21/cubexpress/__init__.py +34 -0
- cubexpress-0.1.21/cubexpress/cache.py +76 -0
- {cubexpress-0.1.18 → cubexpress-0.1.21}/cubexpress/cloud_utils.py +7 -4
- cubexpress-0.1.21/cubexpress/conversion.py +147 -0
- cubexpress-0.1.21/cubexpress/cube.py +209 -0
- cubexpress-0.1.21/cubexpress/downloader.py +115 -0
- cubexpress-0.1.21/cubexpress/geospatial.py +207 -0
- cubexpress-0.1.21/cubexpress/geotyping.py +306 -0
- {cubexpress-0.1.18 → cubexpress-0.1.21}/cubexpress/request.py +21 -29
- cubexpress-0.1.21/pyproject.toml +151 -0
- cubexpress-0.1.18/cubexpress/__init__.py +0 -19
- cubexpress-0.1.18/cubexpress/cache.py +0 -52
- cubexpress-0.1.18/cubexpress/conversion.py +0 -156
- cubexpress-0.1.18/cubexpress/cube.py +0 -240
- cubexpress-0.1.18/cubexpress/downloader.py +0 -97
- cubexpress-0.1.18/cubexpress/geospatial.py +0 -195
- cubexpress-0.1.18/cubexpress/geotyping.py +0 -398
- cubexpress-0.1.18/pyproject.toml +0 -100
- {cubexpress-0.1.18 → cubexpress-0.1.21}/LICENSE +0 -0
- {cubexpress-0.1.18 → cubexpress-0.1.21}/README.md +0 -0

```diff
--- cubexpress-0.1.18/PKG-INFO
+++ cubexpress-0.1.21/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.18
+Version: 0.1.21
 Summary: Efficient processing of cubic Earth-observation (EO) data.
 Home-page: https://github.com/andesdatacube/cubexpress
 Keywords: earth-engine,sentinel-2,geospatial,eo,cube
@@ -16,13 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering :: GIS
 Requires-Dist: earthengine-api (>=1.5.12)
-Requires-Dist: numpy (>=2.0
-Requires-Dist: pandas (>=2.
+Requires-Dist: numpy (>=1.22.4,<2.0)
+Requires-Dist: pandas (>=2.0.0)
 Requires-Dist: pyarrow (>=14.0.0)
-Requires-Dist: pydantic (>=2.
+Requires-Dist: pydantic (>=2.0.0)
 Requires-Dist: pygeohash (>=1.2.0)
 Requires-Dist: pyproj (>=3.6.0)
 Requires-Dist: rasterio (>=1.3.9)
+Requires-Dist: tqdm (>=4.65.0)
 Requires-Dist: utm (>=0.7.0)
 Project-URL: Documentation, https://andesdatacube.github.io/cubexpress
 Project-URL: Repository, https://github.com/andesdatacube/cubexpress
```
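The metadata changes (the numpy pin below 2.0 and the new tqdm dependency) are visible from Python at runtime. A quick check using only the standard library's `importlib.metadata`, assuming the new version is installed:

```python
from importlib.metadata import requires, version

# Inspect the installed distribution's declared requirements.
print(version("cubexpress"))             # e.g. 0.1.21
for dep in requires("cubexpress") or []:
    print(dep)                           # numpy (>=1.22.4,<2.0), tqdm (>=4.65.0), ...
```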

```diff
--- /dev/null
+++ cubexpress-0.1.21/cubexpress/__init__.py
@@ -0,0 +1,34 @@
+"""
+CubExpress - Efficient Earth Engine data download and processing.
+
+Main components:
+- lonlat2rt: Convert coordinates to raster transforms
+- s2_table: Query Sentinel-2 metadata with cloud scores
+- table_to_requestset: Build request sets from metadata
+- get_cube: Download Earth Engine data cubes
+"""
+
+from __future__ import annotations
+
+from cubexpress.cloud_utils import s2_table
+from cubexpress.conversion import geo2utm, lonlat2rt
+from cubexpress.cube import get_cube
+from cubexpress.geotyping import RasterTransform, Request, RequestSet
+from cubexpress.request import table_to_requestset
+
+__all__ = [
+    "lonlat2rt",
+    "geo2utm",
+    "RasterTransform",
+    "Request",
+    "RequestSet",
+    "s2_table",
+    "table_to_requestset",
+    "get_cube",
+]
+
+try:
+    from importlib.metadata import version
+    __version__ = version("cubexpress")
+except Exception:
+    __version__ = "0.0.0-dev"
```
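The exports above outline a query → request → download pipeline. A minimal sketch of that flow follows; the `s2_table` and `table_to_requestset` keyword names are assumptions inferred from `_cache_key` and the docstrings in this diff, not signatures it confirms (only `get_cube`'s signature appears in full, in the cube.py hunk later in this diff):

```python
import cubexpress

print(cubexpress.__version__)  # falls back to "0.0.0-dev" if metadata is missing

# Hypothetical end-to-end call; keyword names are assumed, not confirmed here.
table = cubexpress.s2_table(
    lon=-76.0, lat=40.0,                   # assumed: point of interest
    edge_size=512, scale=10,               # assumed: ROI size and resolution
    start="2024-01-01", end="2024-03-01",  # assumed: date range
)
requests = cubexpress.table_to_requestset(table)  # assumed single-argument form
cubexpress.get_cube(requests, outfolder="./cubes", nworks=4)
```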

```diff
--- /dev/null
+++ cubexpress-0.1.21/cubexpress/cache.py
@@ -0,0 +1,76 @@
+"""Caching utilities for Earth Engine query results."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import pathlib
+
+from cubexpress.config import CACHE_DIR
+
+CACHE_DIR.mkdir(exist_ok=True, parents=True)
+
+
+def _cache_key(
+    lon: float,
+    lat: float,
+    edge_size: int | tuple[int, int],
+    scale: int,
+    collection: str,
+) -> pathlib.Path:
+    """
+    Generate a deterministic cache file path for query parameters.
+
+    Coordinates are rounded to 4 decimal places (~11m precision) to
+    ensure cache hits for equivalent locations.
+
+    Args:
+        lon: Longitude of center point
+        lat: Latitude of center point
+        edge_size: ROI size in pixels
+        scale: Pixel resolution in meters
+        collection: Earth Engine collection ID
+
+    Returns:
+        Path to hashed .parquet cache file
+    """
+    lon_r = round(lon, 4)
+    lat_r = round(lat, 4)
+
+    edge_tuple = (
+        (edge_size, edge_size) if isinstance(edge_size, int)
+        else tuple(edge_size)
+    )
+
+    signature = [lon_r, lat_r, edge_tuple, scale, collection]
+
+    raw = json.dumps(signature, sort_keys=True).encode("utf-8")
+    digest = hashlib.md5(raw).hexdigest()
+
+    return CACHE_DIR / f"{digest}.parquet"
+
+
+def clear_cache() -> int:
+    """
+    Remove all cached query results.
+
+    Returns:
+        Number of files deleted
+    """
+    count = 0
+    for cache_file in CACHE_DIR.glob("*.parquet"):
+        cache_file.unlink()
+        count += 1
+    return count
+
+
+def get_cache_size() -> tuple[int, int]:
+    """
+    Calculate total cache size.
+
+    Returns:
+        Tuple of (file_count, total_bytes)
+    """
+    files = list(CACHE_DIR.glob("*.parquet"))
+    total_bytes = sum(f.stat().st_size for f in files)
+    return len(files), total_bytes
```
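Two properties of `_cache_key` are worth seeing concretely: the 4-decimal rounding makes nearby coordinates share a cache file, and an `int` edge size normalizes to the same tuple as its square equivalent before hashing. A small check (the collection ID is just an illustrative Earth Engine dataset):

```python
from cubexpress.cache import _cache_key

COLL = "COPERNICUS/S2_SR_HARMONIZED"  # illustrative collection ID

a = _cache_key(-76.00001, 40.00002, 512, 10, COLL)
b = _cache_key(-76.00004, 39.99998, 512, 10, COLL)
c = _cache_key(-76.0, 40.0, (512, 512), 10, COLL)

print(a == b)  # True: both round to (-76.0, 40.0)
print(a == c)  # True: 512 normalizes to (512, 512) before hashing
```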

```diff
--- cubexpress-0.1.18/cubexpress/cloud_utils.py
+++ cubexpress-0.1.21/cubexpress/cloud_utils.py
@@ -1,14 +1,15 @@
 from __future__ import annotations
 
 import datetime as dt
-import sys
 import time
+import warnings
+
 import ee
 import pandas as pd
+
 from cubexpress.cache import _cache_key
-import datetime as dt
 from cubexpress.geospatial import _square_roi
-
+
 warnings.filterwarnings('ignore', category=DeprecationWarning)
 
 
@@ -241,7 +242,9 @@ def s2_table(
     elapsed = time.time() - t0
     n_images = len(df_full)
     date_range = f"{start} to {end}"
-
+    actual_start = df_full['date'].min()
+    actual_end = df_full['date'].max()
+    print(f"\r✅ Retrieved {n_images} images from {actual_start} to {actual_end} ({elapsed:.2f}s)")
 
     # Save cache
     if cache:
```
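The second hunk replaces the requested window in the summary line with the table's actual date coverage. If the `date` column holds ISO-8601 strings (as the printed range suggests), pandas' lexicographic `min()`/`max()` coincide with chronological order, so no datetime conversion is needed. A quick illustration:

```python
import pandas as pd

# ISO-8601 strings sort lexicographically in chronological order.
df_full = pd.DataFrame({"date": ["2024-03-05", "2024-01-17", "2024-02-02"]})
print(df_full["date"].min(), df_full["date"].max())
# 2024-01-17 2024-03-05
```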

```diff
--- /dev/null
+++ cubexpress-0.1.21/cubexpress/conversion.py
@@ -0,0 +1,147 @@
+"""Coordinate conversion and raster transform utilities."""
+
+from __future__ import annotations
+
+import utm
+from pyproj import CRS, Transformer
+
+from cubexpress.exceptions import ValidationError
+from cubexpress.geotyping import RasterTransform
+
+
+def parse_edge_size(edge_size: int | tuple[int, int]) -> tuple[int, int]:
+    """
+    Parse edge_size input into (width, height) tuple.
+
+    Args:
+        edge_size: Size specification (int for square, tuple for rectangle)
+
+    Returns:
+        Tuple of (width, height) in pixels
+
+    Raises:
+        ValidationError: If input is invalid
+    """
+    if isinstance(edge_size, int):
+        if edge_size <= 0:
+            raise ValidationError(f"edge_size must be positive, got {edge_size}")
+        return (edge_size, edge_size)
+
+    if len(edge_size) != 2:
+        raise ValidationError(
+            f"edge_size tuple must have 2 elements, got {len(edge_size)}"
+        )
+
+    width, height = edge_size
+    if width <= 0 or height <= 0:
+        raise ValidationError(
+            f"edge_size values must be positive, got {edge_size}"
+        )
+
+    return (width, height)
+
+
+def geo2utm(lon: float, lat: float) -> tuple[float, float, str]:
+    """
+    Convert lat/lon to UTM coordinates and EPSG code.
+
+    Uses the utm library for standard conversion.
+
+    Args:
+        lon: Longitude in decimal degrees
+        lat: Latitude in decimal degrees
+
+    Returns:
+        Tuple of (x, y, epsg_code) where EPSG code is formatted as 'EPSG:XXXXX'
+
+    Raises:
+        utm.OutOfRangeError: If coordinates are outside valid UTM range
+    """
+    x, y, zone, _ = utm.from_latlon(lat, lon)
+    epsg_code = f"326{zone:02d}" if lat >= 0 else f"327{zone:02d}"
+    return float(x), float(y), f"EPSG:{epsg_code}"
+
+
+def lonlat2rt_utm_or_ups(lon: float, lat: float) -> tuple[float, float, str]:
+    """
+    Calculate UTM coordinates using pyproj (fallback for geo2utm).
+
+    This method is more robust than the utm library and works globally,
+    including near the poles. Uses standard UTM zones for all latitudes
+    to match Google Earth Engine behavior.
+
+    Args:
+        lon: Longitude in decimal degrees
+        lat: Latitude in decimal degrees
+
+    Returns:
+        Tuple of (x, y, epsg_code)
+    """
+    zone = int((lon + 180) // 6) + 1
+    epsg_code = 32600 + zone if lat >= 0 else 32700 + zone
+    crs = CRS.from_epsg(epsg_code)
+
+    transformer = Transformer.from_crs(4326, crs, always_xy=True)
+    x, y = transformer.transform(lon, lat)
+
+    return float(x), float(y), f"EPSG:{epsg_code}"
+
+
+def lonlat2rt(
+    lon: float,
+    lat: float,
+    edge_size: int | tuple[int, int],
+    scale: int
+) -> RasterTransform:
+    """
+    Generate a RasterTransform from geographic coordinates.
+
+    Converts (lon, lat) to UTM projection and builds geospatial metadata
+    including affine transformation parameters. The Y-scale is negative
+    because raster images have their origin at the top-left corner.
+
+    Args:
+        lon: Longitude in decimal degrees
+        lat: Latitude in decimal degrees
+        edge_size: Output raster size
+            - int: creates square (width=height=edge_size)
+            - tuple: specifies (width, height) in pixels
+        scale: Spatial resolution in meters per pixel
+
+    Returns:
+        RasterTransform with CRS, geotransform, and dimensions
+
+    Examples:
+        >>> rt = lonlat2rt(lon=-76.0, lat=40.0, edge_size=512, scale=30)
+        >>> print(rt.width, rt.height)
+        512 512
+
+        >>> rt = lonlat2rt(lon=-76.0, lat=40.0, edge_size=(1024, 512), scale=30)
+        >>> print(rt.width, rt.height)
+        1024 512
+    """
+    try:
+        x, y, crs = geo2utm(lon, lat)
+    except Exception:
+        x, y, crs = lonlat2rt_utm_or_ups(lon, lat)
+
+    width, height = parse_edge_size(edge_size)
+
+    half_width_m = (width * scale) / 2
+    half_height_m = (height * scale) / 2
+
+    geotransform = {
+        "scaleX": scale,
+        "shearX": 0,
+        "translateX": x - half_width_m,
+        "scaleY": -scale,
+        "shearY": 0,
+        "translateY": y + half_height_m,
+    }
+
+    return RasterTransform(
+        crs=crs,
+        geotransform=geotransform,
+        width=width,
+        height=height
+    )
```
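A quick numerical check of the affine origin math in `lonlat2rt`, using the same `utm.from_latlon` call as `geo2utm`: the upper-left corner sits half the cube's extent west and north of the center point.

```python
import utm

lon, lat, edge, scale = -76.0, 40.0, 512, 30
x, y, zone, _ = utm.from_latlon(lat, lon)  # zone 18 for lon=-76
half = edge * scale / 2                    # 7680 m on each side
print(f"EPSG:326{zone:02d}")               # EPSG:32618 (northern hemisphere)
print("translateX:", x - half, "translateY:", y + half)
```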

```diff
--- /dev/null
+++ cubexpress-0.1.21/cubexpress/cube.py
@@ -0,0 +1,209 @@
+from __future__ import annotations
+
+import json
+import pathlib
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from copy import deepcopy
+from typing import Any
+
+import ee
+import pandas as pd
+from tqdm import tqdm
+
+from cubexpress.downloader import download_manifest, download_manifests
+from cubexpress.geospatial import calculate_cell_size, quadsplit_manifest
+from cubexpress.geotyping import RequestSet
+from cubexpress.logging_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+def _test_manifest_tiling(manifest: dict[str, Any]) -> int:
+    """
+    Test if a manifest requires tiling without downloading data.
+
+    Args:
+        manifest: Earth Engine download manifest
+
+    Returns:
+        Number of tiles required (1 if no tiling needed)
+    """
+    try:
+        if "assetId" in manifest:
+            _ = ee.data.getPixels(manifest)
+        elif "expression" in manifest:
+            ee_image = ee.deserializer.decode(json.loads(manifest["expression"]))
+            manifest_copy = deepcopy(manifest)
+            manifest_copy["expression"] = ee_image
+            _ = ee.data.computePixels(manifest_copy)
+
+        return 1
+
+    except ee.ee_exception.EEException as err:
+        size = manifest["grid"]["dimensions"]["width"]
+        cell_w, cell_h, power = calculate_cell_size(str(err), size)
+        n_tiles = (2 ** power) ** 2
+        return n_tiles
+
+
+def get_geotiff(
+    manifest: dict[str, Any],
+    full_outname: pathlib.Path | str,
+    nworks: int,
+    return_tile_info: bool = False,
+) -> int | None:
+    """
+    Download a single GeoTIFF with automatic tiling if needed.
+
+    Args:
+        manifest: Earth Engine download manifest
+        full_outname: Output path for final GeoTIFF
+        nworks: Number of worker threads for tiling
+        return_tile_info: If True, return number of tiles created
+
+    Returns:
+        Number of tiles if return_tile_info=True, otherwise None
+    """
+    try:
+        download_manifest(ulist=manifest, full_outname=full_outname)
+        return 1 if return_tile_info else None
+
+    except ee.ee_exception.EEException as err:
+        size = manifest["grid"]["dimensions"]["width"]
+        cell_w, cell_h, power = calculate_cell_size(str(err), size)
+
+        tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
+        n_tiles = len(tiled)
+
+        # Silent tiling - no log spam
+        download_manifests(
+            manifests=tiled,
+            full_outname=full_outname,
+            max_workers=nworks
+        )
+
+        return n_tiles if return_tile_info else None
+
+
+def _detect_optimal_workers(
+    first_manifest: dict[str, Any],
+    total_workers: int
+) -> tuple[int, int]:
+    """
+    Detect optimal worker distribution by testing first image.
+
+    Args:
+        first_manifest: Manifest of first image
+        total_workers: Total workers to distribute
+
+    Returns:
+        Tuple of (outer_workers, inner_workers)
+    """
+    n_tiles = _test_manifest_tiling(first_manifest)
+
+    if n_tiles == 1:
+        outer, inner = total_workers, 1
+        logger.debug(f"No tiling needed - using {outer} parallel images")
+    else:
+        inner = min(n_tiles, max(1, total_workers // 2))
+        outer = max(1, total_workers // inner)
+        logger.info(
+            f"Auto-detected tiling required ({n_tiles} tiles/image) - "
+            f"using outer={outer}, inner={inner}"
+        )
+
+    return outer, inner
+
+
+def get_cube(
+    requests: pd.DataFrame | RequestSet,
+    outfolder: pathlib.Path | str,
+    nworks: int | tuple[int, int] = 4,
+    auto_workers: bool = True
+) -> None:
+    """
+    Download a set of Earth Engine requests in parallel.
+
+    Args:
+        requests: Collection of requests (DataFrame or RequestSet)
+        outfolder: Destination directory
+        nworks: Worker configuration (int or tuple of (outer, inner))
+        auto_workers: If True, automatically detect optimal distribution
+
+    Raises:
+        ValueError: If nworks configuration is invalid
+        TypeError: If nworks has wrong type
+    """
+    outfolder = pathlib.Path(outfolder).expanduser().resolve()
+    outfolder.mkdir(parents=True, exist_ok=True)
+
+    dataframe = (
+        requests._dataframe if isinstance(requests, RequestSet)
+        else requests
+    )
+
+    if dataframe.empty:
+        logger.warning("Request set is empty")
+        return
+
+    # Determine worker configuration
+    if isinstance(nworks, int):
+        if nworks <= 0:
+            raise ValueError(f"nworks must be positive, got {nworks}")
+
+        if auto_workers:
+            first_row = dataframe.iloc[0]
+            nworks_outer, nworks_inner = _detect_optimal_workers(
+                first_manifest=first_row.manifest,
+                total_workers=nworks
+            )
+        else:
+            nworks_outer, nworks_inner = nworks, 1
+
+    elif isinstance(nworks, (list, tuple)):
+        if len(nworks) != 2:
+            raise ValueError(f"nworks tuple must have 2 elements, got {len(nworks)}")
+
+        nworks_outer, nworks_inner = nworks
+
+        if not all(isinstance(n, int) for n in (nworks_outer, nworks_inner)):
+            raise TypeError(f"nworks elements must be integers")
+
+        if nworks_outer <= 0 or nworks_inner <= 0:
+            raise ValueError(f"nworks values must be positive")
+    else:
+        raise TypeError(f"nworks must be int or tuple, got {type(nworks)}")
+
+    # Execute downloads
+    failed = []
+    with ThreadPoolExecutor(max_workers=nworks_outer) as executor:
+        futures = {
+            executor.submit(
+                get_geotiff,
+                manifest=row.manifest,
+                full_outname=outfolder / f"{row.id}.tif",
+                nworks=nworks_inner,
+                return_tile_info=False
+            ): row.id
+            for _, row in dataframe.iterrows()
+        }
+
+        for future in tqdm(
+            as_completed(futures),
+            total=len(futures),
+            desc=f"Downloading (outer={nworks_outer}, inner={nworks_inner})",
+            unit="image",
+            leave=True
+        ):
+            img_id = futures[future]
+            try:
+                future.result()
+            except Exception as exc:
+                logger.error(f"Failed {img_id}: {exc}")
+                failed.append(img_id)
+
+    # Summary
+    if failed:
+        logger.warning(f"{len(failed)}/{len(dataframe)} downloads failed")
+    else:
+        logger.info(f"✓ Downloaded {len(dataframe)} images to {outfolder}")
```
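The worker split in `_detect_optimal_workers` is easy to recompute by hand. A standalone restatement of the same arithmetic (the `n_tiles == 1` case is branched separately in the real code, but the formula reduces to the same answer):

```python
def split(total_workers: int, n_tiles: int) -> tuple[int, int]:
    # Same arithmetic as _detect_optimal_workers above.
    inner = min(n_tiles, max(1, total_workers // 2))
    outer = max(1, total_workers // inner)
    return outer, inner

print(split(4, 1))    # (4, 1): no tiling, all workers fetch whole images
print(split(8, 4))    # (2, 4): 4 tile threads per image, 2 images in flight
print(split(4, 16))   # (2, 2): workers halved between images and tiles
```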

```diff
--- /dev/null
+++ cubexpress-0.1.21/cubexpress/downloader.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+import json
+import pathlib
+import shutil
+import tempfile
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import contextmanager
+from copy import deepcopy
+from typing import Any, Iterator
+
+import ee
+
+from cubexpress.geospatial import merge_tifs
+
+
+@contextmanager
+def temp_workspace(prefix: str = "cubexpress_") -> Iterator[pathlib.Path]:
+    """
+    Create a temporary directory with automatic cleanup.
+
+    Args:
+        prefix: Prefix for the temporary directory name
+
+    Yields:
+        Path to temporary directory
+    """
+    tmp_dir = pathlib.Path(tempfile.mkdtemp(prefix=prefix))
+    try:
+        yield tmp_dir
+    finally:
+        if tmp_dir.exists():
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+def download_manifest(
+    ulist: dict[str, Any],
+    full_outname: pathlib.Path
+) -> None:
+    """
+    Download data from Earth Engine based on a manifest dictionary.
+
+    Handles both direct asset IDs and serialized EE expressions.
+
+    Args:
+        ulist: Export manifest containing 'assetId' or 'expression'
+        full_outname: Destination path for the downloaded file
+
+    Raises:
+        ValueError: If manifest is invalid
+        ee.ee_exception.EEException: If Earth Engine request fails
+    """
+    if "assetId" in ulist:
+        images_bytes = ee.data.getPixels(ulist)
+    elif "expression" in ulist:
+        ee_image = ee.deserializer.decode(json.loads(ulist["expression"]))
+        ulist_deep = deepcopy(ulist)
+        ulist_deep["expression"] = ee_image
+        images_bytes = ee.data.computePixels(ulist_deep)
+    else:
+        raise ValueError("Manifest must contain 'assetId' or 'expression'")
+
+    full_outname.parent.mkdir(parents=True, exist_ok=True)
+    with open(full_outname, "wb") as f:
+        f.write(images_bytes)
+
+
+def download_manifests(
+    manifests: list[dict[str, Any]],
+    full_outname: pathlib.Path,
+    max_workers: int = 1,
+) -> None:
+    """
+    Download multiple manifests concurrently and merge into one file.
+
+    Uses a temporary workspace that is automatically cleaned up.
+
+    Args:
+        manifests: List of Earth Engine manifests
+        full_outname: Final destination path for merged TIFF
+        max_workers: Number of parallel download threads
+
+    Raises:
+        ee.ee_exception.EEException: If any download fails
+        ValueError: If merge fails
+    """
+    with temp_workspace() as tmp_dir:
+        tile_dir = tmp_dir / full_outname.stem
+        tile_dir.mkdir(parents=True, exist_ok=True)
+
+        # Download tiles in parallel
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {
+                executor.submit(
+                    download_manifest,
+                    ulist=manifest,
+                    full_outname=tile_dir / f"{idx:06d}.tif"
+                ): idx
+                for idx, manifest in enumerate(manifests)
+            }
+
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                except Exception as exc:
+                    idx = futures[future]
+                    print(f"Error downloading tile {idx}: {exc}")
+                    raise
+
+        # Merge tiles
+        input_files = sorted(tile_dir.glob("*.tif"))
+        if not input_files:
+            raise ValueError(f"No tiles downloaded in {tile_dir}")
+
+        merge_tifs(input_files, full_outname)
```