cubexpress 0.1.4__tar.gz → 0.1.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cubexpress
3
- Version: 0.1.4
3
+ Version: 0.1.21
4
4
  Summary: Efficient processing of cubic Earth-observation (EO) data.
5
5
  Home-page: https://github.com/andesdatacube/cubexpress
6
- License: MIT
7
6
  Keywords: earth-engine,sentinel-2,geospatial,eo,cube
8
7
  Author: Julio Contreras
9
8
  Author-email: contrerasnetk@gmail.com
10
- Requires-Python: >=3.9,<4.0
9
+ Requires-Python: >=3.9
11
10
  Classifier: License :: OSI Approved :: MIT License
12
11
  Classifier: Programming Language :: Python :: 3
13
12
  Classifier: Programming Language :: Python :: 3.9
@@ -16,14 +15,16 @@ Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Programming Language :: Python :: 3.12
17
16
  Classifier: Programming Language :: Python :: 3 :: Only
18
17
  Classifier: Topic :: Scientific/Engineering :: GIS
19
- Requires-Dist: earthengine-api (>=0.1.392)
20
- Requires-Dist: numpy (>=1.25.2)
21
- Requires-Dist: pandas (>=2.0.3)
18
+ Requires-Dist: earthengine-api (>=1.5.12)
19
+ Requires-Dist: numpy (>=1.22.4,<2.0)
20
+ Requires-Dist: pandas (>=2.0.0)
22
21
  Requires-Dist: pyarrow (>=14.0.0)
23
- Requires-Dist: pygeohash (>=1.2.0,<2.0.0)
22
+ Requires-Dist: pydantic (>=2.0.0)
23
+ Requires-Dist: pygeohash (>=1.2.0)
24
24
  Requires-Dist: pyproj (>=3.6.0)
25
25
  Requires-Dist: rasterio (>=1.3.9)
26
- Requires-Dist: utm (>=0.7.0,<0.9.0)
26
+ Requires-Dist: tqdm (>=4.65.0)
27
+ Requires-Dist: utm (>=0.7.0)
27
28
  Project-URL: Documentation, https://andesdatacube.github.io/cubexpress
28
29
  Project-URL: Repository, https://github.com/andesdatacube/cubexpress
29
30
  Description-Content-Type: text/markdown
@@ -31,7 +32,7 @@ Description-Content-Type: text/markdown
31
32
  <h1></h1>
32
33
 
33
34
  <p align="center">
34
- <img src="./docs/logo_cubexpress.png" width="39%">
35
+ <img src="https://raw.githubusercontent.com/andesdatacube/cubexpress/refs/heads/main/docs/logo_cubexpress.png" width="39%">
35
36
  </p>
36
37
 
37
38
  <p align="center">
@@ -1,7 +1,7 @@
1
1
  <h1></h1>
2
2
 
3
3
  <p align="center">
4
- <img src="./docs/logo_cubexpress.png" width="39%">
4
+ <img src="https://raw.githubusercontent.com/andesdatacube/cubexpress/refs/heads/main/docs/logo_cubexpress.png" width="39%">
5
5
  </p>
6
6
 
7
7
  <p align="center">
@@ -0,0 +1,34 @@
1
+ """
2
+ CubExpress - Efficient Earth Engine data download and processing.
3
+
4
+ Main components:
5
+ - lonlat2rt: Convert coordinates to raster transforms
6
+ - s2_table: Query Sentinel-2 metadata with cloud scores
7
+ - table_to_requestset: Build request sets from metadata
8
+ - get_cube: Download Earth Engine data cubes
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from cubexpress.cloud_utils import s2_table
14
+ from cubexpress.conversion import geo2utm, lonlat2rt
15
+ from cubexpress.cube import get_cube
16
+ from cubexpress.geotyping import RasterTransform, Request, RequestSet
17
+ from cubexpress.request import table_to_requestset
18
+
19
+ __all__ = [
20
+ "lonlat2rt",
21
+ "geo2utm",
22
+ "RasterTransform",
23
+ "Request",
24
+ "RequestSet",
25
+ "s2_table",
26
+ "table_to_requestset",
27
+ "get_cube",
28
+ ]
29
+
30
+ try:
31
+ from importlib.metadata import version
32
+ __version__ = version("cubexpress")
33
+ except Exception:
34
+ __version__ = "0.0.0-dev"
@@ -0,0 +1,76 @@
1
+ """Caching utilities for Earth Engine query results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ import pathlib
8
+
9
+ from cubexpress.config import CACHE_DIR
10
+
11
+ CACHE_DIR.mkdir(exist_ok=True, parents=True)
12
+
13
+
14
def _cache_key(
    lon: float,
    lat: float,
    edge_size: int | tuple[int, int],
    scale: int,
    collection: str,
) -> pathlib.Path:
    """
    Build a deterministic cache file path for a set of query parameters.

    Coordinates are rounded to 4 decimal places (~11 m precision) so that
    effectively identical locations share one cache entry.

    Args:
        lon: Longitude of center point
        lat: Latitude of center point
        edge_size: ROI size in pixels
        scale: Pixel resolution in meters
        collection: Earth Engine collection ID

    Returns:
        Path to hashed .parquet cache file
    """
    # Normalize the ROI size to a (width, height) pair so an int and its
    # equivalent tuple hash to the same key.
    if isinstance(edge_size, int):
        normalized_edge = (edge_size, edge_size)
    else:
        normalized_edge = tuple(edge_size)

    signature = [
        round(lon, 4),
        round(lat, 4),
        normalized_edge,
        scale,
        collection,
    ]

    payload = json.dumps(signature, sort_keys=True).encode("utf-8")
    return CACHE_DIR / f"{hashlib.md5(payload).hexdigest()}.parquet"
51
+
52
+
53
def clear_cache() -> int:
    """
    Remove all cached query results.

    Returns:
        Number of files deleted
    """
    # Materialize first so the count is fixed before deletion begins.
    cached_files = list(CACHE_DIR.glob("*.parquet"))
    for path in cached_files:
        path.unlink()
    return len(cached_files)
65
+
66
+
67
def get_cache_size() -> tuple[int, int]:
    """
    Calculate total cache size.

    Returns:
        Tuple of (file_count, total_bytes)
    """
    file_count = 0
    total_bytes = 0
    for entry in CACHE_DIR.glob("*.parquet"):
        file_count += 1
        total_bytes += entry.stat().st_size
    return file_count, total_bytes
@@ -0,0 +1,271 @@
1
from __future__ import annotations

import datetime as dt
import time
import warnings

import ee
import pandas as pd

from cubexpress.cache import _cache_key
from cubexpress.geospatial import _square_roi

# NOTE(review): presumably silences pandas/earthengine DeprecationWarnings so
# the \r-based progress output below stays readable — confirm this is intended
# module-wide (it affects every caller's warning state).
warnings.filterwarnings('ignore', category=DeprecationWarning)


# --- CONFIGURATION CONSTANTS ---
# Earth Engine collection queried for Sentinel-2 imagery metadata.
S2_COLLECTION = "COPERNICUS/S2_HARMONIZED"
# Companion collection holding Cloud Score Plus per-image cloud scores.
S2_CLOUD_COLLECTION = "GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED"
# Sentinel-2 spectral bands exposed via DataFrame .attrs for downstream use.
S2_BANDS = [
    "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B9", "B10", "B11", "B12"
]
S2_PIXEL_SCALE = 10  # meters
# -------------------------------
24
+
25
def _cloud_table_single_range(
    lon: float,
    lat: float,
    edge_size: int | tuple[int, int],
    start: str,
    end: str
) -> pd.DataFrame:
    """
    Build a daily cloud-score table for a square Sentinel-2 footprint.

    Query Earth Engine for a specific date range, identifying which images
    fully contain the ROI and filling missing cloud scores with daily means.

    Args:
        lon (float): Longitude of the center point.
        lat (float): Latitude of the center point.
        edge_size (int | tuple[int, int]): Side length of the square region
            in Sentinel-2 pixels (10 m each).
        start (str): ISO-8601 start date (inclusive), e.g. "2024-06-01".
        end (str): ISO-8601 end date. NOTE(review): ee.filterDate treats the
            end date as exclusive — confirm callers expect that.

    Returns:
        pd.DataFrame: A DataFrame with one row per image. Columns include:
            * id: Sentinel-2 ID.
            * cs_cdf: Cloud Score Plus CDF (0—1).
            * date: Acquisition date (YYYY-MM-DD).
            * inside: 1 if the image fully contains the ROI, 0 otherwise.

            Note: Every row of a given date shares one ``cs_cdf`` value —
            the score of the first full-coverage image of that day if one
            exists, otherwise the daily mean (mosaic approach).

    Raises:
        ee.ee_exception.EEException: If Earth Engine fails for reasons other
            than an empty collection (e.g., quota exceeded, bad request).
    """
    # Define ROI (bbox around point)
    center = ee.Geometry.Point([lon, lat])
    roi = _square_roi(lon, lat, edge_size, 10)

    # Query S2
    s2 = (
        ee.ImageCollection(S2_COLLECTION)
        .filterBounds(roi)
        .filterDate(start, end)
    )

    # Cloud Score Plus collection
    ic = (
        s2
        .linkCollection(
            ee.ImageCollection(S2_CLOUD_COLLECTION),
            ["cs_cdf"]
        )
        .select(["cs_cdf"])
    )

    # Identify images whose footprint contains the ROI
    ids_inside = (
        ic
        .map(
            lambda img: img.set(
                'roi_inside_scene',
                img.geometry().contains(roi, maxError=10)
            )
        )
        .filter(ee.Filter.eq('roi_inside_scene', True))
        .aggregate_array('system:index')
        .getInfo()
    )

    # FIX: the previous `edge_size * 11` crashed (or replicated the tuple)
    # when edge_size was a (width, height) pair; use the longest side so the
    # sampling footprint still covers the whole ROI.
    edge_px = max(edge_size) if isinstance(edge_size, (tuple, list)) else edge_size

    # Generate % cloud of each image over the ROI
    try:
        raw = ic.getRegion(
            geometry=center,
            scale=edge_px * 11  # 10 m pixels plus margin (it's a tricky calculation)
        ).getInfo()
    except ee.ee_exception.EEException as e:
        if "No bands in collection" in str(e):
            # Empty collection for this range: return an empty, well-typed table.
            return pd.DataFrame(
                columns=["id", "longitude", "latitude", "time", "cs_cdf", "inside"]
            )
        raise

    # Convert raw data to DataFrame (first row of getRegion output is the header)
    df_raw = (
        pd.DataFrame(raw[1:], columns=raw[0])
        .drop(columns=["longitude", "latitude"])
        .assign(
            date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d")
        )
    )

    # Mark images whose ROI is fully inside the scene
    df_raw["inside"] = df_raw["id"].isin(set(ids_inside)).astype(int)

    def _daily_fill(group: pd.DataFrame) -> float:
        """Score of the first full-coverage image of the day, else daily mean."""
        inside_scores = group.loc[group["inside"] == 1, "cs_cdf"]
        return inside_scores.iloc[0] if not inside_scores.empty else group["cs_cdf"].mean()

    # Assign one cloud score per day via an explicit date->value map. This is
    # equivalent to the old groupby().apply(transform).reset_index() chain but
    # does not rely on positional row alignment and avoids the deprecated
    # DataFrameGroupBy.apply-on-grouping-columns pattern.
    if not df_raw.empty:
        fill_by_date = {day: _daily_fill(grp) for day, grp in df_raw.groupby("date")}
        df_raw["cs_cdf"] = df_raw["date"].map(fill_by_date)

    return df_raw
130
+
131
def s2_table(
    lon: float,
    lat: float,
    edge_size: int | tuple[int, int],
    start: str,
    end: str,
    max_cscore: float = 1.0,
    min_cscore: float = 0.0,
    cache: bool = False
) -> pd.DataFrame:
    """
    Build (and cache) a per-day cloud-table for the requested ROI.

    The function checks an on-disk parquet cache keyed on location and
    parameters. If parts of the requested date-range are missing, it fetches
    only those gaps from Earth Engine, merges them, updates the cache, and
    finally filters by cloud score thresholds.

    Args:
        lon (float): Longitude of the center point.
        lat (float): Latitude of the center point.
        edge_size (int | tuple[int, int]): Side length of the square region
            in Sentinel-2 pixels (10 m each).
        start (str): ISO-8601 start date, e.g. "2024-06-01".
        end (str): ISO-8601 end date.
        max_cscore (float, optional): Maximum allowed cloud score CDF (0.0 to 1.0).
            Rows above this threshold are dropped. Defaults to 1.0.
        min_cscore (float, optional): Minimum allowed cloud score CDF (0.0 to 1.0).
            Defaults to 0.0.
        cache (bool, optional): If True, enables on-disk parquet caching to
            avoid re-fetching data for the same parameters. Defaults to False.

    Returns:
        pd.DataFrame: Filtered cloud table. The DataFrame contains useful
            metadata in ``.attrs`` (bands, collection, scale, etc.) needed
            for downstream functions.
    """
    cache_file = _cache_key(lon, lat, edge_size, S2_PIXEL_SCALE, S2_COLLECTION)

    # Load cached data if present
    if cache and cache_file.exists():
        print("📂 Loading cached metadata...", end='', flush=True)
        t0 = time.time()
        df_cached = pd.read_parquet(cache_file)
        have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()

        cached_start = have_idx.min().date()
        cached_end = have_idx.max().date()
        elapsed = time.time() - t0

        if (
            dt.date.fromisoformat(start) >= cached_start
            and dt.date.fromisoformat(end) <= cached_end
        ):
            # Cache fully covers the requested window — no EE round trip.
            print(f"\r✅ Loaded {len(df_cached)} images from cache ({elapsed:.2f}s)")
            df_full = df_cached
        else:
            print(f"\r📂 Cache loaded ({len(df_cached)} images, {elapsed:.2f}s)")

            # Identify missing segments and fetch only those.
            print("⏳ Fetching missing date ranges...", end='', flush=True)
            t0 = time.time()
            df_new_parts = []

            # Gap before the cached window: [start, cached_start)
            if dt.date.fromisoformat(start) < cached_start:
                df_new_parts.append(
                    _cloud_table_single_range(
                        lon=lon,
                        lat=lat,
                        edge_size=edge_size,
                        start=start,
                        end=cached_start.isoformat()
                    )
                )
            # Gap after the cached window: [cached_end, end]
            if dt.date.fromisoformat(end) > cached_end:
                df_new_parts.append(
                    _cloud_table_single_range(
                        lon=lon,
                        lat=lat,
                        edge_size=edge_size,
                        start=cached_end.isoformat(),
                        end=end
                    )
                )
            df_new_parts = [df for df in df_new_parts if not df.empty]

            if df_new_parts:
                df_new = pd.concat(df_new_parts, ignore_index=True)
                elapsed = time.time() - t0
                print(f"\r✅ Fetched {len(df_new)} new images ({elapsed:.2f}s) ")

                # FIX: the later-gap fetch starts at cached_end (inclusive), so
                # images already cached for that day are returned again — drop
                # duplicates by image id before merging.
                df_full = (
                    pd.concat([df_cached, df_new], ignore_index=True)
                    .drop_duplicates(subset="id", keep="first")
                    .sort_values("date", kind="mergesort")
                )
            else:
                elapsed = time.time() - t0
                print(f"\r✅ No new images needed ({elapsed:.2f}s) ")
                df_full = df_cached
    else:
        # No usable cache: query the whole window in one go.
        print("⏳ Querying Earth Engine metadata...", end='', flush=True)
        t0 = time.time()
        df_full = _cloud_table_single_range(
            lon=lon,
            lat=lat,
            edge_size=edge_size,
            start=start,
            end=end
        )
        elapsed = time.time() - t0
        n_images = len(df_full)
        actual_start = df_full['date'].min()
        actual_end = df_full['date'].max()
        print(f"\r✅ Retrieved {n_images} images from {actual_start} to {actual_end} ({elapsed:.2f}s)")

    # Save cache
    if cache:
        df_full.to_parquet(cache_file, compression="zstd")

    # Filter by cloud cover and requested date window
    result = (
        df_full.query("@start <= date <= @end")
        .query("@min_cscore <= cs_cdf <= @max_cscore")
        .reset_index(drop=True)
    )

    # Attach metadata for downstream helpers
    result.attrs.update(
        {
            "lon": lon,
            "lat": lat,
            "edge_size": edge_size,
            "scale": S2_PIXEL_SCALE,
            "bands": S2_BANDS,
            "collection": S2_COLLECTION
        }
    )
    return result
@@ -0,0 +1,147 @@
1
+ """Coordinate conversion and raster transform utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import utm
6
+ from pyproj import CRS, Transformer
7
+
8
+ from cubexpress.exceptions import ValidationError
9
+ from cubexpress.geotyping import RasterTransform
10
+
11
+
12
+ def parse_edge_size(edge_size: int | tuple[int, int]) -> tuple[int, int]:
13
+ """
14
+ Parse edge_size input into (width, height) tuple.
15
+
16
+ Args:
17
+ edge_size: Size specification (int for square, tuple for rectangle)
18
+
19
+ Returns:
20
+ Tuple of (width, height) in pixels
21
+
22
+ Raises:
23
+ ValidationError: If input is invalid
24
+ """
25
+ if isinstance(edge_size, int):
26
+ if edge_size <= 0:
27
+ raise ValidationError(f"edge_size must be positive, got {edge_size}")
28
+ return (edge_size, edge_size)
29
+
30
+ if len(edge_size) != 2:
31
+ raise ValidationError(
32
+ f"edge_size tuple must have 2 elements, got {len(edge_size)}"
33
+ )
34
+
35
+ width, height = edge_size
36
+ if width <= 0 or height <= 0:
37
+ raise ValidationError(
38
+ f"edge_size values must be positive, got {edge_size}"
39
+ )
40
+
41
+ return (width, height)
42
+
43
+
44
def geo2utm(lon: float, lat: float) -> tuple[float, float, str]:
    """
    Convert lat/lon to UTM coordinates and EPSG code.

    Uses the utm library for standard conversion.

    Args:
        lon: Longitude in decimal degrees
        lat: Latitude in decimal degrees

    Returns:
        Tuple of (x, y, epsg_code) where EPSG code is formatted as 'EPSG:XXXXX'

    Raises:
        utm.OutOfRangeError: If coordinates are outside valid UTM range
    """
    easting, northing, zone_number, _zone_letter = utm.from_latlon(lat, lon)
    # EPSG 326xx is WGS84/UTM northern hemisphere; 327xx is southern.
    hemisphere = "326" if lat >= 0 else "327"
    return float(easting), float(northing), f"EPSG:{hemisphere}{zone_number:02d}"
63
+
64
+
65
def lonlat2rt_utm_or_ups(lon: float, lat: float) -> tuple[float, float, str]:
    """
    Calculate UTM coordinates using pyproj (fallback for geo2utm).

    This method is more robust than the utm library and works globally,
    including near the poles. Uses standard UTM zones for all latitudes
    to match Google Earth Engine behavior.

    Args:
        lon: Longitude in decimal degrees
        lat: Latitude in decimal degrees

    Returns:
        Tuple of (x, y, epsg_code)
    """
    # FIX: wrap the longitude into [0, 360) before binning so that
    # lon == 180 maps to zone 1 instead of the invalid "zone 61"
    # (EPSG:32661 is actually UPS North, not a UTM zone).
    zone = int(((lon + 180.0) % 360.0) // 6) + 1
    epsg_code = 32600 + zone if lat >= 0 else 32700 + zone
    crs = CRS.from_epsg(epsg_code)

    transformer = Transformer.from_crs(4326, crs, always_xy=True)
    x, y = transformer.transform(lon, lat)

    return float(x), float(y), f"EPSG:{epsg_code}"
88
+
89
+
90
def lonlat2rt(
    lon: float,
    lat: float,
    edge_size: int | tuple[int, int],
    scale: int
) -> RasterTransform:
    """
    Generate a RasterTransform from geographic coordinates.

    Converts (lon, lat) to UTM projection and builds geospatial metadata
    including affine transformation parameters. The Y-scale is negative
    because raster images have their origin at the top-left corner.

    Args:
        lon: Longitude in decimal degrees
        lat: Latitude in decimal degrees
        edge_size: Output raster size
            - int: creates square (width=height=edge_size)
            - tuple: specifies (width, height) in pixels
        scale: Spatial resolution in meters per pixel

    Returns:
        RasterTransform with CRS, geotransform, and dimensions

    Examples:
        >>> rt = lonlat2rt(lon=-76.0, lat=40.0, edge_size=512, scale=30)
        >>> print(rt.width, rt.height)
        512 512

        >>> rt = lonlat2rt(lon=-76.0, lat=40.0, edge_size=(1024, 512), scale=30)
        >>> print(rt.width, rt.height)
        1024 512
    """
    # Project the center point; fall back to the pyproj-based conversion
    # when the utm library cannot handle the location.
    try:
        center_x, center_y, crs = geo2utm(lon, lat)
    except Exception:
        center_x, center_y, crs = lonlat2rt_utm_or_ups(lon, lat)

    width, height = parse_edge_size(edge_size)

    # The raster origin is the upper-left corner: half the extent to the
    # left of the center and half above it.
    origin_x = center_x - (width * scale) / 2
    origin_y = center_y + (height * scale) / 2

    return RasterTransform(
        crs=crs,
        geotransform={
            "scaleX": scale,
            "shearX": 0,
            "translateX": origin_x,
            "scaleY": -scale,
            "shearY": 0,
            "translateY": origin_y,
        },
        width=width,
        height=height
    )