cubexpress 0.1.18__tar.gz → 0.1.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cubexpress
3
- Version: 0.1.18
3
+ Version: 0.1.21
4
4
  Summary: Efficient processing of cubic Earth-observation (EO) data.
5
5
  Home-page: https://github.com/andesdatacube/cubexpress
6
6
  Keywords: earth-engine,sentinel-2,geospatial,eo,cube
@@ -16,13 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
16
16
  Classifier: Programming Language :: Python :: 3 :: Only
17
17
  Classifier: Topic :: Scientific/Engineering :: GIS
18
18
  Requires-Dist: earthengine-api (>=1.5.12)
19
- Requires-Dist: numpy (>=2.0.2)
20
- Requires-Dist: pandas (>=2.2.2)
19
+ Requires-Dist: numpy (>=1.22.4,<2.0)
20
+ Requires-Dist: pandas (>=2.0.0)
21
21
  Requires-Dist: pyarrow (>=14.0.0)
22
- Requires-Dist: pydantic (>=2.11.4)
22
+ Requires-Dist: pydantic (>=2.0.0)
23
23
  Requires-Dist: pygeohash (>=1.2.0)
24
24
  Requires-Dist: pyproj (>=3.6.0)
25
25
  Requires-Dist: rasterio (>=1.3.9)
26
+ Requires-Dist: tqdm (>=4.65.0)
26
27
  Requires-Dist: utm (>=0.7.0)
27
28
  Project-URL: Documentation, https://andesdatacube.github.io/cubexpress
28
29
  Project-URL: Repository, https://github.com/andesdatacube/cubexpress
@@ -0,0 +1,34 @@
1
+ """
2
+ CubExpress - Efficient Earth Engine data download and processing.
3
+
4
+ Main components:
5
+ - lonlat2rt: Convert coordinates to raster transforms
6
+ - s2_table: Query Sentinel-2 metadata with cloud scores
7
+ - table_to_requestset: Build request sets from metadata
8
+ - get_cube: Download Earth Engine data cubes
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from cubexpress.cloud_utils import s2_table
14
+ from cubexpress.conversion import geo2utm, lonlat2rt
15
+ from cubexpress.cube import get_cube
16
+ from cubexpress.geotyping import RasterTransform, Request, RequestSet
17
+ from cubexpress.request import table_to_requestset
18
+
19
# Names re-exported as the package's public API; this is also what
# ``from cubexpress import *`` exposes.
__all__ = [
    "lonlat2rt",
    "geo2utm",
    "RasterTransform",
    "Request",
    "RequestSet",
    "s2_table",
    "table_to_requestset",
    "get_cube",
]

# Look up the version of the installed "cubexpress" distribution. Any failure
# during the lookup falls back to a development placeholder string.
try:
    from importlib.metadata import version
    __version__ = version("cubexpress")
except Exception:
    __version__ = "0.0.0-dev"
@@ -0,0 +1,76 @@
1
+ """Caching utilities for Earth Engine query results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ import pathlib
8
+
9
+ from cubexpress.config import CACHE_DIR
10
+
11
+ CACHE_DIR.mkdir(exist_ok=True, parents=True)
12
+
13
+
14
def _cache_key(
    lon: float,
    lat: float,
    edge_size: int | tuple[int, int],
    scale: int,
    collection: str,
) -> pathlib.Path:
    """
    Build the cache file path that corresponds to one set of query parameters.

    Longitude and latitude are rounded to 4 decimal places (~11m precision)
    before hashing, so nearly identical locations share a cache entry. An
    integer ``edge_size`` is expanded to a ``(size, size)`` pair so that it
    hashes the same as the equivalent tuple.

    Args:
        lon: Longitude of center point
        lat: Latitude of center point
        edge_size: ROI size in pixels (int or two-element sequence)
        scale: Pixel resolution in meters
        collection: Earth Engine collection ID

    Returns:
        ``CACHE_DIR / "<md5 hex digest>.parquet"``
    """
    rounded_lon = round(lon, 4)
    rounded_lat = round(lat, 4)

    if isinstance(edge_size, int):
        dims = (edge_size, edge_size)
    else:
        dims = tuple(edge_size)

    # JSON gives a stable textual form of the parameters to feed the hash.
    payload = json.dumps(
        [rounded_lon, rounded_lat, dims, scale, collection],
        sort_keys=True,
    )
    file_stem = hashlib.md5(payload.encode("utf-8")).hexdigest()

    return CACHE_DIR / f"{file_stem}.parquet"
51
+
52
+
53
def clear_cache() -> int:
    """
    Delete every ``*.parquet`` file found directly inside ``CACHE_DIR``.

    Returns:
        How many files were removed
    """
    removed = 0
    # enumerate(start=1) leaves ``removed`` equal to the number of files seen.
    for removed, entry in enumerate(CACHE_DIR.glob("*.parquet"), start=1):
        entry.unlink()
    return removed
65
+
66
+
67
def get_cache_size() -> tuple[int, int]:
    """
    Report how many ``*.parquet`` files ``CACHE_DIR`` holds and their size.

    Returns:
        Tuple of (file_count, total_bytes)
    """
    entries = [*CACHE_DIR.glob("*.parquet")]

    byte_total = 0
    for entry in entries:
        byte_total += entry.stat().st_size

    return len(entries), byte_total
@@ -1,14 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import datetime as dt
4
- import sys
5
4
  import time
5
+ import warnings
6
+
6
7
  import ee
7
8
  import pandas as pd
9
+
8
10
  from cubexpress.cache import _cache_key
9
- import datetime as dt
10
11
  from cubexpress.geospatial import _square_roi
11
- import warnings
12
+
12
13
  warnings.filterwarnings('ignore', category=DeprecationWarning)
13
14
 
14
15
 
@@ -241,7 +242,9 @@ def s2_table(
241
242
  elapsed = time.time() - t0
242
243
  n_images = len(df_full)
243
244
  date_range = f"{start} to {end}"
244
- print(f"\r✅ Retrieved {n_images} images from {date_range} ({elapsed:.2f}s)")
245
+ actual_start = df_full['date'].min()
246
+ actual_end = df_full['date'].max()
247
+ print(f"\r✅ Retrieved {n_images} images from {actual_start} to {actual_end} ({elapsed:.2f}s)")
245
248
 
246
249
  # Save cache
247
250
  if cache:
@@ -0,0 +1,147 @@
1
+ """Coordinate conversion and raster transform utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import utm
6
+ from pyproj import CRS, Transformer
7
+
8
+ from cubexpress.exceptions import ValidationError
9
+ from cubexpress.geotyping import RasterTransform
10
+
11
+
12
def parse_edge_size(edge_size: int | tuple[int, int]) -> tuple[int, int]:
    """
    Normalize an ``edge_size`` specification into a ``(width, height)`` pair.

    Args:
        edge_size: A single int (square raster) or a two-element sequence
            ``(width, height)``.

    Returns:
        Tuple of (width, height) in pixels

    Raises:
        ValidationError: If ``edge_size`` is neither an int nor a two-element
            sequence, or if any value is not a positive number.
    """
    if isinstance(edge_size, int):
        if edge_size <= 0:
            raise ValidationError(f"edge_size must be positive, got {edge_size}")
        return (edge_size, edge_size)

    # Inputs without a length (None, float, ...) used to escape as a bare
    # TypeError from len(); report them through the documented exception.
    try:
        n_items = len(edge_size)
    except TypeError as err:
        raise ValidationError(
            "edge_size must be an int or a (width, height) tuple, "
            f"got {type(edge_size).__name__}"
        ) from err

    if n_items != 2:
        raise ValidationError(
            f"edge_size tuple must have 2 elements, got {n_items}"
        )

    width, height = edge_size

    # Elements that cannot be compared with 0 (e.g. strings) are invalid
    # input as well, not an internal error.
    try:
        has_non_positive = width <= 0 or height <= 0
    except TypeError as err:
        raise ValidationError(
            f"edge_size values must be positive numbers, got {edge_size!r}"
        ) from err

    if has_non_positive:
        raise ValidationError(
            f"edge_size values must be positive, got {edge_size}"
        )

    return (width, height)
42
+
43
+
44
def geo2utm(lon: float, lat: float) -> tuple[float, float, str]:
    """
    Project a lon/lat point to UTM with the ``utm`` library.

    The EPSG code is assembled from the zone number returned by
    ``utm.from_latlon``: ``326xx`` when ``lat >= 0`` and ``327xx`` otherwise,
    with the zone number zero-padded to two digits.

    Args:
        lon: Longitude in decimal degrees
        lat: Latitude in decimal degrees

    Returns:
        Tuple of (x, y, epsg_code) where EPSG code is formatted as 'EPSG:XXXXX'

    Raises:
        utm.OutOfRangeError: If coordinates are outside valid UTM range
    """
    easting, northing, zone_number, _zone_letter = utm.from_latlon(lat, lon)
    hemisphere_prefix = "326" if lat >= 0 else "327"
    return (
        float(easting),
        float(northing),
        f"EPSG:{hemisphere_prefix}{zone_number:02d}",
    )
63
+
64
+
65
def lonlat2rt_utm_or_ups(lon: float, lat: float) -> tuple[float, float, str]:
    """
    Calculate UTM coordinates using pyproj (fallback for geo2utm).

    The UTM zone is derived from the longitude alone (6-degree bands) and the
    hemisphere from the sign of the latitude; standard UTM zones are used for
    every latitude, including near the poles.

    Args:
        lon: Longitude in decimal degrees
        lat: Latitude in decimal degrees

    Returns:
        Tuple of (x, y, epsg_code) where EPSG code is formatted as 'EPSG:XXXXX'
    """
    # Wrap the zone index into 1..60. Without the modulo, lon == 180 produced
    # zone 61, i.e. EPSG:32661 / EPSG:32761, which are the UPS North/South
    # codes and not UTM zones at all.
    zone = int((lon + 180) // 6) % 60 + 1
    epsg_code = 32600 + zone if lat >= 0 else 32700 + zone
    crs = CRS.from_epsg(epsg_code)

    # always_xy=True makes the transformer take (lon, lat) in that order.
    transformer = Transformer.from_crs(4326, crs, always_xy=True)
    x, y = transformer.transform(lon, lat)

    return float(x), float(y), f"EPSG:{epsg_code}"
88
+
89
+
90
def lonlat2rt(
    lon: float,
    lat: float,
    edge_size: int | tuple[int, int],
    scale: int
) -> RasterTransform:
    """
    Build a RasterTransform centred on a geographic point.

    The point is projected with ``geo2utm``; if that raises, the pyproj-based
    ``lonlat2rt_utm_or_ups`` is used instead. The raster's top-left corner is
    placed half the raster extent west and north of the projected point, and
    the Y scale is negative because rows grow downward from that corner.

    Args:
        lon: Longitude in decimal degrees
        lat: Latitude in decimal degrees
        edge_size: Output raster size
            - int: creates square (width=height=edge_size)
            - tuple: specifies (width, height) in pixels
        scale: Spatial resolution in meters per pixel

    Returns:
        RasterTransform with CRS, geotransform, and dimensions

    Examples:
        >>> rt = lonlat2rt(lon=-76.0, lat=40.0, edge_size=(1024, 512), scale=30)
        >>> print(rt.width, rt.height)
        1024 512
    """
    try:
        center_x, center_y, crs_code = geo2utm(lon, lat)
    except Exception:
        center_x, center_y, crs_code = lonlat2rt_utm_or_ups(lon, lat)

    n_cols, n_rows = parse_edge_size(edge_size)

    # Top-left corner of the raster in projected coordinates.
    left = center_x - (n_cols * scale) / 2
    top = center_y + (n_rows * scale) / 2

    affine = dict(
        scaleX=scale,
        shearX=0,
        translateX=left,
        scaleY=-scale,
        shearY=0,
        translateY=top,
    )

    return RasterTransform(
        crs=crs_code,
        geotransform=affine,
        width=n_cols,
        height=n_rows,
    )
@@ -0,0 +1,209 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import pathlib
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from copy import deepcopy
7
+ from typing import Any
8
+
9
+ import ee
10
+ import pandas as pd
11
+ from tqdm import tqdm
12
+
13
+ from cubexpress.downloader import download_manifest, download_manifests
14
+ from cubexpress.geospatial import calculate_cell_size, quadsplit_manifest
15
+ from cubexpress.geotyping import RequestSet
16
+ from cubexpress.logging_config import setup_logger
17
+
18
# Module-level logger, obtained from the project's setup_logger helper using
# this module's name.
logger = setup_logger(__name__)
19
+
20
+
21
def _test_manifest_tiling(manifest: dict[str, Any]) -> int:
    """
    Probe Earth Engine to find out how many tiles a manifest needs.

    The pixel request is issued and its result discarded. If Earth Engine
    answers with an ``EEException``, the error text and the manifest's grid
    width are passed to ``calculate_cell_size`` and the returned power is
    turned into a tile count of ``(2 ** power) ** 2``. A manifest with neither
    'assetId' nor 'expression' sends no request and is reported as 1 tile.

    Args:
        manifest: Earth Engine download manifest

    Returns:
        Number of tiles required (1 if no tiling needed)
    """
    try:
        if "assetId" in manifest:
            ee.data.getPixels(manifest)
        elif "expression" in manifest:
            decoded_image = ee.deserializer.decode(
                json.loads(manifest["expression"])
            )
            # Work on a copy so the caller's manifest keeps its serialized form.
            probe = deepcopy(manifest)
            probe["expression"] = decoded_image
            ee.data.computePixels(probe)
    except ee.ee_exception.EEException as err:
        grid_width = manifest["grid"]["dimensions"]["width"]
        _cell_w, _cell_h, power = calculate_cell_size(str(err), grid_width)
        return (2 ** power) ** 2

    return 1
47
+
48
+
49
def get_geotiff(
    manifest: dict[str, Any],
    full_outname: pathlib.Path | str,
    nworks: int,
    return_tile_info: bool = False,
) -> int | None:
    """
    Download a single GeoTIFF with automatic tiling if needed.

    A direct download is attempted first. If Earth Engine raises an
    ``EEException``, the error text and the manifest's grid width are passed
    to ``calculate_cell_size``; the manifest is then split with
    ``quadsplit_manifest`` and the pieces are fetched and merged by
    ``download_manifests``.

    Args:
        manifest: Earth Engine download manifest
        full_outname: Output path for final GeoTIFF (``str`` or ``Path``)
        nworks: Number of worker threads for tiling
        return_tile_info: If True, return number of tiles created

    Returns:
        Number of tiles if return_tile_info=True, otherwise None
    """
    # The signature accepts str, but the download helpers rely on Path
    # attributes (.parent / .stem); normalize once so both forms work.
    full_outname = pathlib.Path(full_outname)

    try:
        download_manifest(ulist=manifest, full_outname=full_outname)
    except ee.ee_exception.EEException as err:
        size = manifest["grid"]["dimensions"]["width"]
        cell_w, cell_h, power = calculate_cell_size(str(err), size)

        tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
        n_tiles = len(tiled)

        # Silent tiling - no log spam
        download_manifests(
            manifests=tiled,
            full_outname=full_outname,
            max_workers=nworks,
        )
    else:
        n_tiles = 1

    return n_tiles if return_tile_info else None
86
+
87
+
88
def _detect_optimal_workers(
    first_manifest: dict[str, Any],
    total_workers: int
) -> tuple[int, int]:
    """
    Split a worker budget between images and tiles by probing one manifest.

    When the probed manifest needs a single tile, every worker goes to
    image-level parallelism. Otherwise at most half of the budget (but at
    least one worker, and never more than the tile count) is given to
    tile-level parallelism and the remainder is divided among images.

    Args:
        first_manifest: Manifest of first image
        total_workers: Total workers to distribute

    Returns:
        Tuple of (outer_workers, inner_workers)
    """
    tiles_per_image = _test_manifest_tiling(first_manifest)

    if tiles_per_image != 1:
        inner = min(tiles_per_image, max(1, total_workers // 2))
        outer = max(1, total_workers // inner)
        logger.info(
            f"Auto-detected tiling required ({tiles_per_image} tiles/image) - "
            f"using outer={outer}, inner={inner}"
        )
        return outer, inner

    outer, inner = total_workers, 1
    logger.debug(f"No tiling needed - using {outer} parallel images")
    return outer, inner
116
+
117
+
118
def get_cube(
    requests: pd.DataFrame | RequestSet,
    outfolder: pathlib.Path | str,
    nworks: int | tuple[int, int] = 4,
    auto_workers: bool = True
) -> None:
    """
    Download a set of Earth Engine requests in parallel.

    Each row is written to ``<outfolder>/<row.id>.tif`` by ``get_geotiff``.
    Individual download failures are logged and counted, not raised. An empty
    request set returns early, before ``nworks`` is validated.

    Args:
        requests: Collection of requests (DataFrame or RequestSet)
        outfolder: Destination directory (created if missing)
        nworks: Worker configuration. An int is a total budget; a two-element
            list/tuple is an explicit (outer, inner) split.
        auto_workers: If True and ``nworks`` is an int, probe the first
            manifest to choose the (outer, inner) split. Not consulted when
            ``nworks`` is a list/tuple.

    Raises:
        ValueError: If nworks configuration is invalid
        TypeError: If nworks has wrong type
    """
    outfolder = pathlib.Path(outfolder).expanduser().resolve()
    outfolder.mkdir(parents=True, exist_ok=True)

    if isinstance(requests, RequestSet):
        dataframe = requests._dataframe
    else:
        dataframe = requests

    if dataframe.empty:
        logger.warning("Request set is empty")
        return

    # Resolve the (outer, inner) worker split.
    if isinstance(nworks, int):
        if nworks <= 0:
            raise ValueError(f"nworks must be positive, got {nworks}")

        if not auto_workers:
            nworks_outer, nworks_inner = nworks, 1
        else:
            probe_row = dataframe.iloc[0]
            nworks_outer, nworks_inner = _detect_optimal_workers(
                first_manifest=probe_row.manifest,
                total_workers=nworks,
            )
    elif isinstance(nworks, (list, tuple)):
        if len(nworks) != 2:
            raise ValueError(f"nworks tuple must have 2 elements, got {len(nworks)}")

        nworks_outer, nworks_inner = nworks

        if not (isinstance(nworks_outer, int) and isinstance(nworks_inner, int)):
            raise TypeError("nworks elements must be integers")

        if nworks_outer <= 0 or nworks_inner <= 0:
            raise ValueError("nworks values must be positive")
    else:
        raise TypeError(f"nworks must be int or tuple, got {type(nworks)}")

    # Submit one job per row, then drain them as they finish.
    failed = []
    with ThreadPoolExecutor(max_workers=nworks_outer) as executor:
        futures = {}
        for _, row in dataframe.iterrows():
            job = executor.submit(
                get_geotiff,
                manifest=row.manifest,
                full_outname=outfolder / f"{row.id}.tif",
                nworks=nworks_inner,
                return_tile_info=False,
            )
            futures[job] = row.id

        progress = tqdm(
            as_completed(futures),
            total=len(futures),
            desc=f"Downloading (outer={nworks_outer}, inner={nworks_inner})",
            unit="image",
            leave=True,
        )
        for future in progress:
            img_id = futures[future]
            try:
                future.result()
            except Exception as exc:
                logger.error(f"Failed {img_id}: {exc}")
                failed.append(img_id)

    # Summary
    if not failed:
        logger.info(f"✓ Downloaded {len(dataframe)} images to {outfolder}")
    else:
        logger.warning(f"{len(failed)}/{len(dataframe)} downloads failed")
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import pathlib
5
+ import shutil
6
+ import tempfile
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ from contextlib import contextmanager
9
+ from copy import deepcopy
10
+ from typing import Any, Iterator
11
+
12
+ import ee
13
+
14
+ from cubexpress.geospatial import merge_tifs
15
+
16
+
17
@contextmanager
def temp_workspace(prefix: str = "cubexpress_") -> Iterator[pathlib.Path]:
    """
    Provide a scratch directory that is removed when the ``with`` block ends.

    The directory is deleted whether the block finishes normally or raises;
    errors during deletion are ignored.

    Args:
        prefix: Prefix for the temporary directory name

    Yields:
        Path to temporary directory
    """
    workspace = pathlib.Path(tempfile.mkdtemp(prefix=prefix))
    try:
        yield workspace
    finally:
        still_there = workspace.exists()
        if still_there:
            shutil.rmtree(workspace, ignore_errors=True)
34
+
35
+
36
def download_manifest(
    ulist: dict[str, Any],
    full_outname: pathlib.Path
) -> None:
    """
    Fetch the pixels described by a manifest and write them to a file.

    A manifest with an 'assetId' key is sent to ``ee.data.getPixels``. One
    with an 'expression' key has that JSON expression decoded and is sent to
    ``ee.data.computePixels`` via a copy, leaving the caller's manifest
    unmodified. 'assetId' takes precedence when both keys are present.

    Args:
        ulist: Export manifest containing 'assetId' or 'expression'
        full_outname: Destination path for the downloaded file; missing
            parent directories are created

    Raises:
        ValueError: If manifest is invalid
        ee.ee_exception.EEException: If Earth Engine request fails
    """
    has_asset = "assetId" in ulist
    if not has_asset and "expression" not in ulist:
        raise ValueError("Manifest must contain 'assetId' or 'expression'")

    if has_asset:
        payload = ee.data.getPixels(ulist)
    else:
        decoded_image = ee.deserializer.decode(json.loads(ulist["expression"]))
        request = deepcopy(ulist)
        request["expression"] = decoded_image
        payload = ee.data.computePixels(request)

    full_outname.parent.mkdir(parents=True, exist_ok=True)
    with open(full_outname, "wb") as sink:
        sink.write(payload)
66
+
67
+
68
def download_manifests(
    manifests: list[dict[str, Any]],
    full_outname: pathlib.Path,
    max_workers: int = 1,
) -> None:
    """
    Download several manifests as tiles and merge them into one file.

    Tiles are written into a temporary workspace as ``000000.tif``,
    ``000001.tif``, ... (manifest order) and handed to ``merge_tifs`` in
    sorted name order. The workspace is removed when the function exits.

    Args:
        manifests: List of Earth Engine manifests
        full_outname: Final destination path for merged TIFF
        max_workers: Number of parallel download threads

    Raises:
        ee.ee_exception.EEException: If any download fails
        ValueError: If merge fails
    """
    with temp_workspace() as tmp_dir:
        tile_dir = tmp_dir / full_outname.stem
        tile_dir.mkdir(parents=True, exist_ok=True)

        # Fetch every tile; the first failure is reported and re-raised.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            pending = {}
            for idx, manifest in enumerate(manifests):
                job = executor.submit(
                    download_manifest,
                    ulist=manifest,
                    full_outname=tile_dir / f"{idx:06d}.tif",
                )
                pending[job] = idx

            for job in as_completed(pending):
                try:
                    job.result()
                except Exception as exc:
                    tile_idx = pending[job]
                    print(f"Error downloading tile {tile_idx}: {exc}")
                    raise

        # Merge tiles
        tile_paths = sorted(tile_dir.glob("*.tif"))
        if not tile_paths:
            raise ValueError(f"No tiles downloaded in {tile_dir}")

        merge_tifs(tile_paths, full_outname)