cubexpress 0.1.4.tar.gz → 0.1.18.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

--- cubexpress-0.1.4/PKG-INFO
+++ cubexpress-0.1.18/PKG-INFO
@@ -1,13 +1,12 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.4
+Version: 0.1.18
 Summary: Efficient processing of cubic Earth-observation (EO) data.
 Home-page: https://github.com/andesdatacube/cubexpress
-License: MIT
 Keywords: earth-engine,sentinel-2,geospatial,eo,cube
 Author: Julio Contreras
 Author-email: contrerasnetk@gmail.com
-Requires-Python: >=3.9,<4.0
+Requires-Python: >=3.9
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
@@ -16,14 +15,15 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering :: GIS
-Requires-Dist: earthengine-api (>=0.1.392)
-Requires-Dist: numpy (>=1.25.2)
-Requires-Dist: pandas (>=2.0.3)
+Requires-Dist: earthengine-api (>=1.5.12)
+Requires-Dist: numpy (>=2.0.2)
+Requires-Dist: pandas (>=2.2.2)
 Requires-Dist: pyarrow (>=14.0.0)
-Requires-Dist: pygeohash (>=1.2.0,<2.0.0)
+Requires-Dist: pydantic (>=2.11.4)
+Requires-Dist: pygeohash (>=1.2.0)
 Requires-Dist: pyproj (>=3.6.0)
 Requires-Dist: rasterio (>=1.3.9)
-Requires-Dist: utm (>=0.7.0,<0.9.0)
+Requires-Dist: utm (>=0.7.0)
 Project-URL: Documentation, https://andesdatacube.github.io/cubexpress
 Project-URL: Repository, https://github.com/andesdatacube/cubexpress
 Description-Content-Type: text/markdown
@@ -31,7 +31,7 @@ Description-Content-Type: text/markdown
 <h1></h1>
 
 <p align="center">
-  <img src="./docs/logo_cubexpress.png" width="39%">
+  <img src="https://raw.githubusercontent.com/andesdatacube/cubexpress/refs/heads/main/docs/logo_cubexpress.png" width="39%">
 </p>
 
 <p align="center">

--- cubexpress-0.1.4/README.md
+++ cubexpress-0.1.18/README.md
@@ -1,7 +1,7 @@
 <h1></h1>
 
 <p align="center">
-  <img src="./docs/logo_cubexpress.png" width="39%">
+  <img src="https://raw.githubusercontent.com/andesdatacube/cubexpress/refs/heads/main/docs/logo_cubexpress.png" width="39%">
 </p>
 
 <p align="center">

--- cubexpress-0.1.4/cubexpress/__init__.py
+++ cubexpress-0.1.18/cubexpress/__init__.py
@@ -1,13 +1,11 @@
 from cubexpress.conversion import lonlat2rt, geo2utm
 from cubexpress.geotyping import RasterTransform, Request, RequestSet
-from cubexpress.cloud_utils import cloud_table
+from cubexpress.cloud_utils import s2_table
 from cubexpress.cube import get_cube
 from cubexpress.request import table_to_requestset
+# import importlib.metadata
 
 
-
-# pyproj
-# Export the functions
 __all__ = [
     "lonlat2rt",
     "RasterTransform",
@@ -15,11 +13,7 @@ __all__ = [
     "RequestSet",
     "geo2utm",
     "get_cube",
-    "cloud_table",
+    "s2_table",
     "table_to_requestset"
 ]
-
-# Dynamic version import
-import importlib.metadata
-
-__version__ = importlib.metadata.version("cubexpress")
+# __version__ = importlib.metadata.version("cubexpress")
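
In 0.1.18 the public helper `cloud_table` is renamed to `s2_table`, and the dynamic `__version__` lookup is commented out (so `cubexpress.__version__` is no longer set at import time). A minimal migration sketch, not taken from the package docs:

```python
# Migration sketch for 0.1.4 -> 0.1.18 (illustrative):
# old: from cubexpress import cloud_table
from cubexpress import s2_table  # same role, now named for its Sentinel-2 focus
```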

--- /dev/null
+++ cubexpress-0.1.18/cubexpress/cache.py
@@ -0,0 +1,52 @@
+import hashlib
+import json
+import os
+import pathlib
+from typing import Final
+
+# Directory for storing cached metadata files (configurable via env var)
+_CACHE_DIR: Final[pathlib.Path] = pathlib.Path(
+    os.getenv("CUBEXPRESS_CACHE", "~/.cubexpress_cache")
+).expanduser()
+_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _cache_key(
+    lon: float,
+    lat: float,
+    edge_size: int | tuple[int, int],
+    scale: int,
+    collection: str,
+) -> pathlib.Path:
+    """
+    Generates a deterministic file path for caching query results.
+
+    Hashes the query parameters to create a unique filename. Coordinates
+    are rounded to 4 decimals to ensure cache hits on equivalent locations.
+
+    Args:
+        lon (float): Longitude of the center point.
+        lat (float): Latitude of the center point.
+        edge_size (int | tuple[int, int]): Size of the ROI in pixels.
+        scale (int): Pixel resolution in meters.
+        collection (str): Earth Engine collection ID.
+
+    Returns:
+        pathlib.Path: Full path to the hashed .parquet cache file.
+    """
+    # Round coordinates to ~11 m precision to group nearby requests
+    lon_r, lat_r = round(lon, 4), round(lat, 4)
+
+    # Normalize edge_size to a tuple for consistent hashing
+    if isinstance(edge_size, int):
+        edge_tuple = (edge_size, edge_size)
+    else:
+        edge_tuple = edge_size
+
+    # Create a unique signature for this request configuration
+    signature = [lon_r, lat_r, edge_tuple, scale, collection]
+
+    # Use MD5 to generate a short, filesystem-friendly filename
+    raw = json.dumps(signature).encode("utf-8")
+    digest = hashlib.md5(raw).hexdigest()
+    return _CACHE_DIR / f"{digest}.parquet"
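
As a quick illustration of the cache-key behavior above (a hedged sketch, not part of the package's test suite): two nearby points that agree to four decimal places, with otherwise identical parameters, resolve to the same parquet path.

```python
# Hypothetical check of _cache_key determinism; the coordinates are made up.
from cubexpress.cache import _cache_key

k1 = _cache_key(-76.00001, 40.00002, 512, 10, "COPERNICUS/S2_HARMONIZED")
k2 = _cache_key(-76.0, 40.0, 512, 10, "COPERNICUS/S2_HARMONIZED")
assert k1 == k2  # both round to (-76.0, 40.0) and normalize 512 -> (512, 512)
```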

--- /dev/null
+++ cubexpress-0.1.18/cubexpress/cloud_utils.py
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+import datetime as dt
+import time
+import warnings
+import ee
+import pandas as pd
+from cubexpress.cache import _cache_key
+from cubexpress.geospatial import _square_roi
+
+warnings.filterwarnings('ignore', category=DeprecationWarning)
+
+
+# --- CONFIGURATION CONSTANTS ---
+S2_COLLECTION = "COPERNICUS/S2_HARMONIZED"
+S2_CLOUD_COLLECTION = "GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED"
+S2_BANDS = [
+    "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B9", "B10", "B11", "B12"
+]
+S2_PIXEL_SCALE = 10  # meters
+# -------------------------------
+
+def _cloud_table_single_range(
+    lon: float,
+    lat: float,
+    edge_size: int | tuple[int, int],
+    start: str,
+    end: str
+) -> pd.DataFrame:
+    """
+    Build a daily cloud-score table for a square Sentinel-2 footprint.
+
+    Queries Earth Engine for a specific date range, identifying which images
+    fully contain the ROI and filling missing cloud scores with daily means.
+
+    Args:
+        lon (float): Longitude of the center point.
+        lat (float): Latitude of the center point.
+        edge_size (int | tuple[int, int]): Side length of the square region
+            in Sentinel-2 pixels (10 m each).
+        start (str): ISO-8601 start date (inclusive), e.g. "2024-06-01".
+        end (str): ISO-8601 end date (inclusive).
+
+    Returns:
+        pd.DataFrame: A DataFrame with one row per image. Columns include:
+            * id: Sentinel-2 ID.
+            * cs_cdf: Cloud Score Plus CDF (0-1).
+            * date: Acquisition date (YYYY-MM-DD).
+            * inside: 1 if the image fully contains the ROI, 0 otherwise.
+
+        Note: Missing ``cs_cdf`` values are filled with the mean of the
+        same day if a full-coverage image is not available.
+
+    Raises:
+        ee.ee_exception.EEException: If Earth Engine fails for reasons other
+            than an empty collection (e.g., quota exceeded, bad request).
+    """
+    # Define ROI (bbox around point)
+    center = ee.Geometry.Point([lon, lat])
+    roi = _square_roi(lon, lat, edge_size, 10)
+
+    # Query S2
+    s2 = (
+        ee.ImageCollection(S2_COLLECTION)
+        .filterBounds(roi)
+        .filterDate(start, end)
+    )
+
+    # Cloud Score Plus collection
+    ic = (
+        s2
+        .linkCollection(
+            ee.ImageCollection(S2_CLOUD_COLLECTION),
+            ["cs_cdf"]
+        )
+        .select(["cs_cdf"])
+    )
+
+    # Identify images whose footprint contains the ROI
+    ids_inside = (
+        ic
+        .map(
+            lambda img: img.set(
+                'roi_inside_scene',
+                img.geometry().contains(roi, maxError=10)
+            )
+        )
+        .filter(ee.Filter.eq('roi_inside_scene', True))
+        .aggregate_array('system:index')
+        .getInfo()
+    )
+
+    # Sample the cloud score of each image over the ROI
+    try:
+        raw = ic.getRegion(
+            geometry=center,
+            scale=edge_size * 11  # heuristic: 10 m pixels plus a margin; assumes an int edge_size
+        ).getInfo()
+    except ee.ee_exception.EEException as e:
+        if "No bands in collection" in str(e):
+            return pd.DataFrame(
+                columns=["id", "longitude", "latitude", "time", "cs_cdf", "inside"]
+            )
+        raise
+
+    # Convert raw data to DataFrame
+    df_raw = (
+        pd.DataFrame(raw[1:], columns=raw[0])
+        .drop(columns=["longitude", "latitude"])
+        .assign(
+            date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d")
+        )
+    )
+
+    # Mark images whose ROI is fully inside the scene
+    df_raw["inside"] = df_raw["id"].isin(set(ids_inside)).astype(int)
+
+    # Fill missing cloud scores with daily mean (mosaic approach)
+    df_raw['cs_cdf'] = df_raw.groupby('date').apply(
+        lambda group: group['cs_cdf'].transform(
+            lambda _: group[group['inside'] == 1]['cs_cdf'].iloc[0]
+            if (group['inside'] == 1).any()
+            else group['cs_cdf'].mean()
+        )
+    ).reset_index(drop=True)
+
+    return df_raw
+
+def s2_table(
+    lon: float,
+    lat: float,
+    edge_size: int | tuple[int, int],
+    start: str,
+    end: str,
+    max_cscore: float = 1.0,
+    min_cscore: float = 0.0,
+    cache: bool = False
+) -> pd.DataFrame:
+    """
+    Build (and cache) a per-day cloud table for the requested ROI.
+
+    The function checks an on-disk parquet cache keyed on location and
+    parameters. If parts of the requested date range are missing, it fetches
+    only those gaps from Earth Engine, merges them, updates the cache, and
+    finally filters by cloud-score thresholds.
+
+    Args:
+        lon (float): Longitude of the center point.
+        lat (float): Latitude of the center point.
+        edge_size (int | tuple[int, int]): Side length of the square region
+            in Sentinel-2 pixels (10 m each).
+        start (str): ISO-8601 start date, e.g. "2024-06-01".
+        end (str): ISO-8601 end date.
+        max_cscore (float, optional): Maximum allowed cloud score CDF (0.0 to 1.0).
+            Rows above this threshold are dropped. Defaults to 1.0.
+        min_cscore (float, optional): Minimum allowed cloud score CDF (0.0 to 1.0).
+            Defaults to 0.0.
+        cache (bool, optional): If True, enables on-disk parquet caching to
+            avoid re-fetching data for the same parameters. Defaults to False.
+
+    Returns:
+        pd.DataFrame: Filtered cloud table. The DataFrame carries useful
+            metadata in ``.attrs`` (bands, collection, scale, etc.) needed
+            by downstream functions.
+    """
+    cache_file = _cache_key(lon, lat, edge_size, S2_PIXEL_SCALE, S2_COLLECTION)
+
+    # Load cached data if present
+    if cache and cache_file.exists():
+        print("📂 Loading cached metadata...", end='', flush=True)
+        t0 = time.time()
+        df_cached = pd.read_parquet(cache_file)
+        have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()
+
+        cached_start = have_idx.min().date()
+        cached_end = have_idx.max().date()
+        elapsed = time.time() - t0
+
+        if (
+            dt.date.fromisoformat(start) >= cached_start
+            and dt.date.fromisoformat(end) <= cached_end
+        ):
+            print(f"\r✅ Loaded {len(df_cached)} images from cache ({elapsed:.2f}s)")
+            df_full = df_cached
+        else:
+            print(f"\r📂 Cache loaded ({len(df_cached)} images, {elapsed:.2f}s)")
+
+            # Identify missing segments and fetch only those
+            print("⏳ Fetching missing date ranges...", end='', flush=True)
+            t0 = time.time()
+            df_new_parts = []
+
+            if dt.date.fromisoformat(start) < cached_start:
+                a1, b1 = start, cached_start.isoformat()
+                df_new_parts.append(
+                    _cloud_table_single_range(
+                        lon=lon,
+                        lat=lat,
+                        edge_size=edge_size,
+                        start=a1,
+                        end=b1
+                    )
+                )
+            if dt.date.fromisoformat(end) > cached_end:
+                a2, b2 = cached_end.isoformat(), end
+                df_new_parts.append(
+                    _cloud_table_single_range(
+                        lon=lon,
+                        lat=lat,
+                        edge_size=edge_size,
+                        start=a2,
+                        end=b2
+                    )
+                )
+            df_new_parts = [df for df in df_new_parts if not df.empty]
+
+            if df_new_parts:
+                df_new = pd.concat(df_new_parts, ignore_index=True)
+                elapsed = time.time() - t0
+                print(f"\r✅ Fetched {len(df_new)} new images ({elapsed:.2f}s) ")
+
+                df_full = (
+                    pd.concat([df_cached, df_new], ignore_index=True)
+                    .sort_values("date", kind="mergesort")
+                )
+            else:
+                elapsed = time.time() - t0
+                print(f"\r✅ No new images needed ({elapsed:.2f}s) ")
+                df_full = df_cached
+    else:
+        print("⏳ Querying Earth Engine metadata...", end='', flush=True)
+        t0 = time.time()
+        df_full = _cloud_table_single_range(
+            lon=lon,
+            lat=lat,
+            edge_size=edge_size,
+            start=start,
+            end=end
+        )
+        elapsed = time.time() - t0
+        n_images = len(df_full)
+        date_range = f"{start} to {end}"
+        print(f"\r✅ Retrieved {n_images} images from {date_range} ({elapsed:.2f}s)")
+
+    # Save cache
+    if cache:
+        df_full.to_parquet(cache_file, compression="zstd")
+
+    # Filter by cloud cover and requested date window
+    result = (
+        df_full.query("@start <= date <= @end")
+        .query("@min_cscore <= cs_cdf <= @max_cscore")
+        .reset_index(drop=True)
+    )
+
+    # Attach metadata for downstream helpers
+    result.attrs.update(
+        {
+            "lon": lon,
+            "lat": lat,
+            "edge_size": edge_size,
+            "scale": S2_PIXEL_SCALE,
+            "bands": S2_BANDS,
+            "collection": S2_COLLECTION
+        }
+    )
+    return result
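
A minimal usage sketch for the new `s2_table` entry point, assuming `ee.Initialize()` has already been run against a valid Earth Engine project; the point, dates, and threshold are illustrative. Note that in Cloud Score Plus a `cs_cdf` near 1 means clear sky, so clear-scene filtering uses `min_cscore`.

```python
# Hedged example; requires Earth Engine authentication and network access.
import ee
import cubexpress

ee.Initialize()

table = cubexpress.s2_table(
    lon=-77.03, lat=-12.04,                # hypothetical point near Lima
    edge_size=256,                         # 256 x 256 Sentinel-2 pixels (10 m each)
    start="2024-01-01", end="2024-03-31",
    min_cscore=0.6,                        # keep mostly clear scenes (cs_cdf near 1 = clear)
    cache=True,                            # reuse the on-disk parquet cache between calls
)
print(len(table), "images; collection:", table.attrs["collection"])
```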

--- /dev/null
+++ cubexpress-0.1.18/cubexpress/conversion.py
@@ -0,0 +1,156 @@
+import utm
+from pyproj import CRS, Transformer
+from cubexpress.geotyping import RasterTransform
+
+def parse_edge_size(edge_size: int | tuple[int, int]) -> tuple[int, int]:
+    """
+    Parse an edge_size input into a (width, height) tuple.
+
+    Args:
+        edge_size: An int for a square output, or a (width, height) tuple in pixels.
+
+    Returns:
+        tuple[int, int]: (width, height) in pixels.
+
+    Raises:
+        ValueError: If the tuple length != 2 or any value <= 0.
+    """
+    if isinstance(edge_size, int):
+        if edge_size <= 0:
+            raise ValueError(f"edge_size must be positive, got {edge_size}")
+        return (edge_size, edge_size)
+    else:
+        if len(edge_size) != 2:
+            raise ValueError(f"edge_size tuple must have 2 elements, got {len(edge_size)}")
+        width, height = edge_size
+        if width <= 0 or height <= 0:
+            raise ValueError(f"edge_size values must be positive, got {edge_size}")
+        return (width, height)
+
+def geo2utm(
+    lon: float,
+    lat: float
+) -> tuple[float, float, str]:
+    """
+    Converts longitude/latitude coordinates to UTM coordinates and returns the EPSG code.
+
+    Args:
+        lon (float): Longitude in decimal degrees.
+        lat (float): Latitude in decimal degrees.
+
+    Returns:
+        tuple[float, float, str]: UTM coordinates (x, y) in meters and EPSG code as string.
+
+    Raises:
+        utm.OutOfRangeError: If coordinates are outside the valid UTM range.
+    """
+    x, y, zone, _ = utm.from_latlon(lat, lon)
+    epsg_code = f"326{zone:02d}" if lat >= 0 else f"327{zone:02d}"
+    return float(x), float(y), f"EPSG:{epsg_code}"
+
+
+def lonlat2rt_utm_or_ups(
+    lon: float,
+    lat: float
+) -> tuple[float, float, str]:
+    """
+    Calculate UTM coordinates using pyproj (fallback for geo2utm).
+
+    Uses standard UTM zones for all latitudes, matching GEE behavior.
+    This method is more robust than the utm library and works globally.
+
+    Note:
+        UTM is designed for [-80°, 84°] but works globally with
+        acceptable distortions for small tiles.
+
+    Args:
+        lon (float): Longitude in decimal degrees.
+        lat (float): Latitude in decimal degrees.
+
+    Returns:
+        tuple[float, float, str]: UTM coordinates (x, y) in meters and EPSG code as string.
+    """
+    zone = int((lon + 180) // 6) + 1
+    epsg_code = 32600 + zone if lat >= 0 else 32700 + zone
+    crs = CRS.from_epsg(epsg_code)
+
+    to_xy = Transformer.from_crs(4326, crs, always_xy=True)
+    x, y = to_xy.transform(lon, lat)
+
+    return float(x), float(y), f"EPSG:{epsg_code}"
+
+
+def lonlat2rt(
+    lon: float,
+    lat: float,
+    edge_size: int | tuple[int, int],
+    scale: int
+) -> RasterTransform:
+    """
+    Generates a ``RasterTransform`` for a given point by converting geographic (lon, lat)
+    coordinates to a UTM projection and building the necessary geotransform metadata.
+
+    This function:
+    1. Converts the input (lon, lat) to UTM coordinates using :func:`geo2utm`.
+    2. If that fails (e.g., near the poles), falls back to a pyproj-based calculation.
+    3. Defines the extent of the raster in UTM meters based on the specified dimensions
+       and ``scale`` (meters per pixel).
+    4. Sets the Y-scale to be negative (``-scale``) because geospatial images typically
+       place the origin at the top-left corner, giving a downward Y axis.
+
+    Args:
+        lon (float): Longitude in decimal degrees.
+        lat (float): Latitude in decimal degrees.
+        edge_size (int | tuple[int, int]): Size of the output raster.
+            If int, creates a square (width=height=edge_size).
+            If tuple, specifies (width, height) in pixels.
+        scale (int): Spatial resolution in meters per pixel.
+
+    Returns:
+        RasterTransform: A Pydantic model containing:
+            - ``crs``: The EPSG code in the form ``"EPSG:XYZ"``,
+            - ``geotransform``: A dictionary with the affine transform parameters,
+            - ``width`` and ``height``.
+
+    Examples:
+        Square raster:
+
+        >>> rt = cubexpress.lonlat2rt(
+        ...     lon=-76.0, lat=40.0,
+        ...     edge_size=512, scale=30
+        ... )
+        >>> print(rt.width, rt.height)
+        512 512
+
+        Rectangular raster:
+
+        >>> rt = cubexpress.lonlat2rt(
+        ...     lon=-76.0, lat=40.0,
+        ...     edge_size=(1024, 512), scale=30
+        ... )
+        >>> print(rt.width, rt.height)
+        1024 512
+    """
+    try:
+        x, y, crs = geo2utm(lon, lat)
+    except Exception:
+        x, y, crs = lonlat2rt_utm_or_ups(lon, lat)
+
+    # Parse edge_size
+    width, height = parse_edge_size(edge_size)
+
+    half_width = (width * scale) / 2
+    half_height = (height * scale) / 2
+
+    geotransform = dict(
+        scaleX=scale,
+        shearX=0,
+        translateX=x - half_width,
+        scaleY=-scale,
+        shearY=0,
+        translateY=y + half_height,
+    )
+
+    return RasterTransform(
+        crs=crs, geotransform=geotransform, width=width, height=height
+    )
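
A hedged worked check of the geotransform arithmetic in `lonlat2rt` (assuming ``geotransform`` is exposed as the dict built above): the raster origin sits half the extent west and north of the projected center point.

```python
# Illustrative check, not from the package's test suite.
from cubexpress.conversion import geo2utm, lonlat2rt

x, y, crs = geo2utm(lon=-76.0, lat=40.0)  # zone 18 north -> EPSG:32618
rt = lonlat2rt(lon=-76.0, lat=40.0, edge_size=512, scale=30)

assert rt.crs == crs
assert rt.geotransform["translateX"] == x - (512 * 30) / 2  # origin west of center
assert rt.geotransform["translateY"] == y + (512 * 30) / 2  # origin north of center
```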