cubexpress 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

cubexpress/__init__.py CHANGED
@@ -1,15 +1,22 @@
-from cubexpress.conversion import lonlat2rt
-from cubexpress.download import getcube, getGeoTIFF
+from cubexpress.conversion import lonlat2rt, geo2utm
 from cubexpress.geotyping import RasterTransform, Request, RequestSet
+from cubexpress.cloud_utils import cloud_table
+from cubexpress.cube import get_cube
+from cubexpress.request import table_to_requestset
 
+
+
+# pyproj
 # Export the functions
 __all__ = [
     "lonlat2rt",
     "RasterTransform",
     "Request",
     "RequestSet",
-    "getcube",
-    "getGeoTIFF",
+    "geo2utm",
+    "get_cube",
+    "cloud_table",
+    "table_to_requestset"
 ]
 
 # Dynamic version import
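
The import changes above replace the removed `getcube`/`getGeoTIFF` pair with the new cloud-table workflow. For orientation, a minimal sketch of the resulting 0.1.1 public surface; it simply mirrors the new `__all__` and assumes the wheel is installed:

    # Names re-exported by cubexpress/__init__.py in 0.1.1
    from cubexpress import (
        RasterTransform, Request, RequestSet,
        lonlat2rt, geo2utm,
        cloud_table, get_cube, table_to_requestset,
    )
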
cubexpress/cache.py ADDED
@@ -0,0 +1,50 @@
+"""Simple file-based cache helpers for cloud_table results."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+import pathlib
+from typing import Final
+
+# Folder where per-location parquet files are stored.
+_CACHE_DIR: Final[pathlib.Path] = pathlib.Path(
+    os.getenv("CUBEXPRESS_CACHE", "~/.cubexpress_cache")
+).expanduser()
+_CACHE_DIR.mkdir(exist_ok=True)
+
+
+def _cache_key(
+    lon: float,
+    lat: float,
+    edge_size: int,
+    scale: int,
+    collection: str,
+) -> pathlib.Path:
+    """Return deterministic parquet path for the given query parameters.
+
+    A 128-bit MD5 hash of the rounded coordinates, edge size, scale and
+    collection is used as file name to avoid overly long paths and ensure
+    uniqueness.
+
+    Parameters
+    ----------
+    lon, lat
+        Centre coordinates in decimal degrees; rounded to 4 dp (≈ 11 m).
+    edge_size
+        Edge length in pixels of the requested square ROI.
+    scale
+        Pixel size in metres.
+    collection
+        EE collection name (e.g. ``"COPERNICUS/S2_HARMONIZED"``).
+
+    Returns
+    -------
+    pathlib.Path
+        Absolute path ending in ``.parquet`` under ``_CACHE_DIR``.
+    """
+    lon_r, lat_r = round(lon, 4), round(lat, 4)
+    raw = json.dumps([lon_r, lat_r, edge_size, scale, collection]).encode()
+    digest = hashlib.md5(raw).hexdigest()  # noqa: S324 – non-cryptographic OK
+    return _CACHE_DIR / f"{digest}.parquet"
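
Because the key hashes rounded inputs, nearby coordinates collapse onto one cache file. A small illustrative sketch (the printed path is hypothetical, and importing the module creates `~/.cubexpress_cache` as a side effect):

    from cubexpress.cache import _cache_key

    p1 = _cache_key(-76.5, -9.5, 2048, 10, "COPERNICUS/S2_HARMONIZED")
    p2 = _cache_key(-76.50004, -9.49996, 2048, 10, "COPERNICUS/S2_HARMONIZED")

    # Rounding to 4 decimal places (~11 m) makes both queries share a file.
    assert p1 == p2
    print(p1)  # e.g. /home/user/.cubexpress_cache/<md5-digest>.parquet
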
cubexpress/cloud_utils.py ADDED
@@ -0,0 +1,256 @@
+"""Cloud-coverage tables for Sentinel-2 over a square ROI.
+
+Two helpers are exposed:
+
+* :func:`_cloud_table_single_range` – query Earth Engine for one date range.
+* :func:`cloud_table` – smart wrapper that adds on-disk caching, automatic
+  back-filling, and cloud-percentage filtering.
+
+Both return a ``pandas.DataFrame`` with columns **day**, **cloudPct**, and
+**images**; :func:`cloud_table` also attaches ``.attrs`` metadata for later use.
+"""
+
+from __future__ import annotations
+
+import datetime as dt
+import json
+import pathlib
+from typing import List, Optional
+
+import ee
+import pandas as pd
+
+from cubexpress.cache import _cache_key
+from cubexpress.geospatial import _square_roi
+
+
+def _cloud_table_single_range(
+    lon: float,
+    lat: float,
+    edge_size: int,
+    scale: int,
+    start: str,
+    end: str,
+    collection: str = "COPERNICUS/S2_HARMONIZED",
+) -> pd.DataFrame:
+    """Return raw cloud-table rows for a single *start–end* interval.
+
+    Parameters
+    ----------
+    lon, lat
+        Centre coordinates in decimal degrees.
+    edge_size, scale
+        ROI size in pixels (*edge_size*) and pixel resolution in metres
+        (*scale*), fed into :pyfunc:`cubexpress.geospatial._square_roi`.
+    start, end
+        ISO dates (``YYYY-MM-DD``) delimiting the query.
+    collection
+        Sentinel-2 collection name to query.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Columns: **day** (str), **cloudPct** (float), **images** (str
+        concatenation of asset IDs separated by ``-``). No filtering applied.
+    """
+    roi = _square_roi(lon, lat, edge_size, scale)
+    s2 = ee.ImageCollection(collection)
+
+    if collection in (
+        "COPERNICUS/S2_HARMONIZED",
+        "COPERNICUS/S2_SR_HARMONIZED",
+    ):
+        qa_band = "cs_cdf"
+        csp = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
+    else:
+        qa_band, csp = None, None
+
+    def _add_props(img):
+        day = ee.Date(img.get("system:time_start")).format("YYYY-MM-dd")
+        imgid = img.get("system:index")
+
+        if qa_band:
+            score = (
+                img.linkCollection(csp, [qa_band])
+                .select([qa_band])
+                .reduceRegion(ee.Reducer.mean(), roi, scale)
+                .get(qa_band)
+            )
+            # A null score is flagged with -1: cloudPct becomes 200, so any
+            # later cloud_max filter drops the row.
+            score_safe = ee.Algorithms.If(score, score, -1)
+            cloud_pct = (
+                ee.Number(1)
+                .subtract(ee.Number(score_safe))
+                .multiply(10000)
+                .round()
+                .divide(100)
+            )
+        else:
+            cloud_pct = ee.Number(-1)
+
+        return ee.Feature(
+            None,
+            {
+                "day": day,
+                "cloudPct": cloud_pct,
+                "images": imgid,
+            },
+        )
+
+    triples = (
+        s2.filterDate(start, end)
+        .filterBounds(roi)
+        .map(_add_props)
+        .reduceColumns(ee.Reducer.toList(3), ["day", "cloudPct", "images"])
+        .get("list")
+        .getInfo()
+    )
+
+    df = pd.DataFrame(triples, columns=["day", "cloudPct", "images"]).dropna()
+    df["cloudPct"] = df["cloudPct"].astype(float)
+    df["images"] = df["images"].astype(str)
+    return df
+
+
+def cloud_table(
+    lon: float,
+    lat: float,
+    edge_size: int = 2048,
+    scale: int = 10,
+    start: str = "2017-01-01",
+    end: str = "2024-12-31",
+    cloud_max: float = 7.0,
+    bands: Optional[List[str]] = None,
+    collection: str = "COPERNICUS/S2_HARMONIZED",
+    output_path: str | pathlib.Path | None = None,
+    cache: bool = True,
+    verbose: bool = True,
+) -> pd.DataFrame:
+    """Build (and cache) a per-day cloud table for the requested ROI.
+
+    The function first checks an on-disk parquet cache keyed on location and
+    parameters. If parts of the requested date range are missing, it fetches
+    only those gaps from Earth Engine, merges, updates the cache and finally
+    filters by *cloud_max*.
+
+    Parameters
+    ----------
+    lon, lat
+        Centre coordinates.
+    edge_size, scale
+        Square size (pixels) and resolution (metres).
+    start, end
+        ISO start/end dates.
+    cloud_max
+        Maximum allowed cloud percentage (0-100). Rows at or above this
+        threshold are dropped.
+    bands
+        List of spectral bands to embed as metadata. If *None* the full
+        Sentinel-2 set is used.
+    collection
+        Sentinel-2 collection to query.
+    output_path
+        Downstream path hint stored in ``result.attrs``; not used internally.
+    cache
+        Toggle parquet caching.
+    verbose
+        If *True* prints cache info/progress.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Filtered cloud table with ``.attrs`` containing the call parameters.
+    """
+    if bands is None:
+        bands = [
+            "B1",
+            "B2",
+            "B3",
+            "B4",
+            "B5",
+            "B6",
+            "B7",
+            "B8",
+            "B8A",
+            "B9",
+            "B10",
+            "B11",
+            "B12",
+        ]
+
+    cache_file = _cache_key(lon, lat, edge_size, scale, collection)
+
+    # ─── 1. Load cached data if present ────────────────────────────────────
+    if cache and cache_file.exists():
+        if verbose:
+            print("📂 Loading cached table …")
+        df_cached = pd.read_parquet(cache_file)
+        have_idx = pd.to_datetime(df_cached["day"], errors="coerce").dropna()
+
+        cached_start = have_idx.min().date()
+        cached_end = have_idx.max().date()
+
+        if (
+            dt.date.fromisoformat(start) >= cached_start
+            and dt.date.fromisoformat(end) <= cached_end
+        ):
+            if verbose:
+                print("✅ Served entirely from cache.")
+            df_full = df_cached
+        else:
+            # Identify missing segments and fetch only those.
+            df_new_parts = []
+            if dt.date.fromisoformat(start) < cached_start:
+                a1, b1 = start, cached_start.isoformat()
+                df_new_parts.append(
+                    _cloud_table_single_range(
+                        lon, lat, edge_size, scale, a1, b1, collection
+                    )
+                )
+            if dt.date.fromisoformat(end) > cached_end:
+                a2, b2 = cached_end.isoformat(), end
+                df_new_parts.append(
+                    _cloud_table_single_range(
+                        lon, lat, edge_size, scale, a2, b2, collection
+                    )
+                )
+            df_new = pd.concat(df_new_parts, ignore_index=True)
+            df_full = (
+                pd.concat([df_cached, df_new], ignore_index=True)
+                .drop_duplicates("day")
+                .sort_values("day", kind="mergesort")
+            )
+    else:
+        # No cache or caching disabled: fetch full range.
+        if verbose:
+            msg = "Generating table (no cache found)…" if cache else "Generating table…"
+            print("⏳", msg)
+        df_full = _cloud_table_single_range(
+            lon, lat, edge_size, scale, start, end, collection
+        )
+
+    # ─── 2. Save cache ─────────────────────────────────────────────────────
+    if cache:
+        df_full.to_parquet(cache_file, compression="zstd")
+
+    # ─── 3. Filter by cloud cover and requested date window ────────────────
+    result = (
+        df_full.query("@start <= day <= @end")
+        .query("cloudPct < @cloud_max")
+        .reset_index(drop=True)
+    )
+
+    # Attach metadata for downstream helpers
+    result.attrs.update(
+        {
+            "lon": lon,
+            "lat": lat,
+            "edge_size": edge_size,
+            "scale": scale,
+            "bands": bands,
+            "collection": collection,
+            "cloud_max": cloud_max,
+            "output_path": str(output_path) if output_path else "",
+        }
+    )
+    return result
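
A minimal usage sketch for the new table builder, assuming an authenticated Earth Engine session (`earthengine authenticate`) and `pyarrow` installed for the parquet cache; the coordinates and dates are illustrative:

    import ee

    ee.Initialize()

    from cubexpress.cloud_utils import cloud_table

    df = cloud_table(
        lon=-76.5,
        lat=-9.5,
        edge_size=512,
        scale=10,
        start="2023-01-01",
        end="2023-12-31",
        cloud_max=7.0,
    )
    print(df.head())          # columns: day, cloudPct, images
    print(df.attrs["bands"])  # metadata later consumed by table_to_requestset
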
cubexpress/conversion.py CHANGED
@@ -19,7 +19,7 @@ def geo2utm(lon: float, lat: float) -> tuple[float, float, str]:
     """
     x, y, zone, _ = utm.from_latlon(lat, lon)
    epsg_code = f"326{zone:02d}" if lat >= 0 else f"327{zone:02d}"
-    return x, y, f"EPSG:{epsg_code}"
+    return float(x), float(y), f"EPSG:{epsg_code}"
 
 
 def lonlat2rt(lon: float, lat: float, edge_size: int, scale: int) -> RasterTransform:
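
The one-line change above coerces the UTM coordinates to plain Python floats, presumably so downstream code never sees non-float scalar types. A quick sketch of the function's contract (illustrative coordinates; no Earth Engine session needed):

    from cubexpress.conversion import geo2utm

    x, y, crs = geo2utm(lon=-76.5, lat=-9.5)
    # Southern hemisphere -> EPSG:327xx; zone 18 for this longitude.
    print(x, y, crs)  # e.g. (..., ..., 'EPSG:32718')
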
cubexpress/cube.py ADDED
@@ -0,0 +1,84 @@
+"""High-level helpers for tiled GeoTIFF downloads.
+
+The module provides two thread-friendly wrappers:
+
+* **get_geotiff** – download a single manifest, auto-tiling on EE pixel-count
+  errors.
+* **get_cube** – iterate over a ``RequestSet`` (or similar) and build a local
+  raster “cube” in parallel.
+
+The core download/split logic lives in *cubexpress.downloader* and
+*cubexpress.geospatial*; here we merely orchestrate it.
+"""
+
+from __future__ import annotations
+
+import pathlib
+import concurrent.futures
+from typing import Dict, Any
+
+import ee
+from cubexpress.downloader import download_manifest, download_manifests
+from cubexpress.geospatial import quadsplit_manifest, calculate_cell_size
+from cubexpress.geotyping import RequestSet
+
+
+def get_geotiff(
+    manifest: Dict[str, Any],
+    full_outname: pathlib.Path | str,
+    nworks: int = 4,
+) -> None:
+    """Download *manifest* to *full_outname*, retrying with tiled requests.
+
+    Parameters
+    ----------
+    manifest
+        Earth Engine download manifest returned by cubexpress.
+    full_outname
+        Final ``.tif`` path (created/overwritten).
+    nworks
+        Maximum worker threads when the image must be split; default **4**.
+    """
+    try:
+        download_manifest(manifest, pathlib.Path(full_outname))
+    except ee.ee_exception.EEException as err:
+        # Handle EE “too many pixels” error by recursive tiling.
+        size = manifest["grid"]["dimensions"]["width"]  # square images assumed
+        cell_w, cell_h, power = calculate_cell_size(str(err), size)
+        tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
+        download_manifests(tiled, max_workers=nworks, full_outname=pathlib.Path(full_outname))
+
+
+def get_cube(
+    requests: RequestSet,
+    outfolder: pathlib.Path | str,
+    nworks: int = 4,
+) -> None:
+    """Download every request in *requests* to *outfolder* using a thread pool.
+
+    Each row in ``requests._dataframe`` must expose ``manifest`` and ``id``.
+    Resulting files are named ``{id}.tif``.
+
+    Parameters
+    ----------
+    requests
+        A ``RequestSet`` or object with an internal ``_dataframe`` attribute.
+    outfolder
+        Folder where the GeoTIFFs will be written (created if absent).
+    nworks
+        Pool size for concurrent downloads; default **4**.
+    """
+    out = pathlib.Path(outfolder)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=nworks) as pool:
+        futures = []
+        for _, row in requests._dataframe.iterrows():
+            outname = out / f"{row.id}.tif"
+            outname.parent.mkdir(parents=True, exist_ok=True)
+            futures.append(pool.submit(get_geotiff, row.manifest, outname, nworks))
+
+        for fut in concurrent.futures.as_completed(futures):
+            try:
+                fut.result()
+            except Exception as exc:  # noqa: BLE001 – log and keep going
+                print(f"Download error: {exc}")
cubexpress/downloader.py ADDED
@@ -0,0 +1,95 @@
+"""Low-level download helpers for Earth Engine manifests.
+
+Only two public callables are exposed:
+
+* :func:`download_manifest` – fetch a single manifest and write one GeoTIFF.
+* :func:`download_manifests` – convenience wrapper to parallel-download a list
+  of manifests with a thread pool.
+
+Both functions are fully I/O bound; no return value is expected.
+"""
+
+from __future__ import annotations
+
+import json
+import pathlib
+import concurrent.futures
+from copy import deepcopy
+from typing import Any, Dict, List
+
+import ee
+import rasterio as rio
+from rasterio.io import MemoryFile
+
+
+def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None:
+    """Download *ulist* and save it as *full_outname*.
+
+    The manifest must include either an ``assetId`` or an ``expression``
+    (serialized EE image). RasterIO is used to write a tiled, compressed
+    GeoTIFF; the function is silent apart from the final ``print``.
+    """
+    if "assetId" in ulist:
+        images_bytes = ee.data.getPixels(ulist)
+    elif "expression" in ulist:
+        ee_image = ee.deserializer.decode(json.loads(ulist["expression"]))
+        ulist_deep = deepcopy(ulist)
+        ulist_deep["expression"] = ee_image
+        images_bytes = ee.data.computePixels(ulist_deep)
+    else:  # pragma: no cover
+        raise ValueError("Manifest does not contain 'assetId' or 'expression'")
+
+    with MemoryFile(images_bytes) as memfile:
+        with memfile.open() as src:
+            profile = src.profile
+            profile.update(
+                {
+                    "driver": "GTiff",
+                    "tiled": "yes",
+                    "interleave": "band",
+                    "blockxsize": 256,
+                    "blockysize": 256,
+                    "compress": "ZSTD",
+                    "predictor": 2,
+                    "num_threads": 20,
+                    "nodata": 65535,
+                    "dtype": "uint16",
+                    "count": 13,
+                    "zstd_level": 13,
+                    "copy_src_overviews": True,
+                    "overviews": "AUTO",
+                }
+            )
+            all_bands = src.read()
+
+    with rio.open(full_outname, "w", **profile) as dst:
+        dst.write(all_bands)
+
+    print(f"{full_outname} downloaded successfully.")  # noqa: T201
+
+
+def download_manifests(
+    manifests: List[Dict[str, Any]],
+    max_workers: int,
+    full_outname: pathlib.Path,
+) -> None:
+    """Download every manifest in *manifests* concurrently.
+
+    Each output file is saved in the folder
+    ``full_outname.parent/full_outname.stem`` with names ``000000.tif``,
+    ``000001.tif`` … according to the list order.
+    """
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = []
+
+        for index, umanifest in enumerate(manifests):
+            folder = full_outname.parent / full_outname.stem
+            folder.mkdir(parents=True, exist_ok=True)
+            outname = folder / f"{index:06d}.tif"
+            futures.append(executor.submit(download_manifest, umanifest, outname))
+
+        for fut in concurrent.futures.as_completed(futures):
+            try:
+                fut.result()
+            except Exception as exc:  # noqa: BLE001
+                print(f"Error in one of the downloads: {exc}")  # noqa: T201
cubexpress/geospatial.py ADDED
@@ -0,0 +1,55 @@
+import ee
+import re
+from copy import deepcopy
+from typing import Dict
+
+
+def quadsplit_manifest(manifest: Dict, cell_width: int, cell_height: int, power: int) -> list[Dict]:
+    manifest_copy = deepcopy(manifest)
+
+    manifest_copy["grid"]["dimensions"]["width"] = cell_width
+    manifest_copy["grid"]["dimensions"]["height"] = cell_height
+    x = manifest_copy["grid"]["affineTransform"]["translateX"]
+    y = manifest_copy["grid"]["affineTransform"]["translateY"]
+    scale_x = manifest_copy["grid"]["affineTransform"]["scaleX"]
+    scale_y = manifest_copy["grid"]["affineTransform"]["scaleY"]
+
+    manifests = []
+
+    for columny in range(2**power):
+        for rowx in range(2**power):
+            new_x = x + (rowx * cell_width) * scale_x
+            new_y = y + (columny * cell_height) * scale_y
+            new_manifest = deepcopy(manifest_copy)
+            new_manifest["grid"]["affineTransform"]["translateX"] = new_x
+            new_manifest["grid"]["affineTransform"]["translateY"] = new_y
+            manifests.append(new_manifest)
+
+    return manifests
+
+
+
+def calculate_cell_size(ee_error_message: str, size: int) -> tuple[int, int, int]:
+    match = re.findall(r'\d+', ee_error_message)
+    image_pixel = int(match[0])
+    max_pixel = int(match[1])
+
+    images = image_pixel / max_pixel
+    power = 0
+
+    while images > 1:
+        power += 1
+        images = image_pixel / (max_pixel * 4 ** power)
+
+    cell_width = size // 2 ** power
+    cell_height = size // 2 ** power
+
+    return cell_width, cell_height, power
+
+
+
+def _square_roi(lon: float, lat: float, edge_size: int, scale: int) -> ee.Geometry:
+    """Return a square `ee.Geometry` centred on (*lon*, *lat*)."""
+    half = edge_size * scale / 2
+    point = ee.Geometry.Point([lon, lat])
+    return point.buffer(half).bounds()
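
To make the splitting arithmetic concrete: `calculate_cell_size` parses the two integers out of the error text and raises the split power until the request fits, each power quartering the tile. A sketch with a synthetic error message shaped like EE's size-limit errors (the embedded integers are the requested total and the allowed maximum); no EE session is needed:

    from cubexpress.geospatial import calculate_cell_size

    err = ("Total request size (268435456 pixels) must be less than "
           "or equal to 50331648 pixels.")
    w, h, power = calculate_cell_size(err, size=4096)
    # 268435456 / 50331648 ≈ 5.3, so two quad-splits are needed:
    print(w, h, power)  # 1024 1024 2  -> a 4x4 grid of 1024-pixel tiles
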
cubexpress/geotyping.py CHANGED
@@ -282,7 +282,7 @@ class RequestSet(BaseModel):
        return pd.DataFrame(
            [
                {
-                    "id": meta.id,
+                    "id": meta.id,  # add cloud
                    "lon": lon[index],
                    "lat": lat[index],
                    "x": x[index],
cubexpress/request.py ADDED
@@ -0,0 +1,77 @@
+"""Convert cloud_table output into a RequestSet."""
+
+from __future__ import annotations
+
+import ee
+import pandas as pd
+import pygeohash as pgh
+from typing import List
+
+from cubexpress.geotyping import Request, RequestSet
+from cubexpress.conversion import lonlat2rt
+
+
+def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
+    """Return a :class:`RequestSet` built from *df* (cloud_table result).
+
+    Parameters
+    ----------
+    df
+        DataFrame with *day* and *images* columns plus attrs created by
+        :pyfunc:`cubexpress.cloud_table`.
+    mosaic
+        If ``True`` a single mosaic per day is requested; otherwise each
+        individual asset becomes its own request.
+
+    Raises
+    ------
+    ValueError
+        If *df* is empty after filtering.
+    """
+    if df.empty:
+        raise ValueError("cloud_table returned no rows; nothing to request.")
+
+    rt = lonlat2rt(
+        lon=df.attrs["lon"],
+        lat=df.attrs["lat"],
+        edge_size=df.attrs["edge_size"],
+        scale=df.attrs["scale"],
+    )
+    centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
+    reqs: List[Request] = []
+
+    if mosaic:
+        # group all asset IDs per day
+        grouped = (
+            df.assign(img=lambda x: x.images.str.split("-"))
+            .explode("img")
+            .groupby("day")["img"]
+            .apply(list)
+        )
+
+        for day, img_ids in grouped.items():
+            ee_img = ee.ImageCollection(
+                [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
+            ).mosaic()
+            reqs.append(
+                Request(
+                    id=f"{day}_{centre_hash}_mosaic",
+                    raster_transform=rt,
+                    image=ee_img,
+                    bands=df.attrs["bands"],
+                )
+            )
+    else:  # one request per asset
+        for _, row in df.iterrows():
+            for img_id in row["images"].split("-"):
+                reqs.append(
+                    Request(
+                        id=f"{row['day']}_{centre_hash}_{img_id}",
+                        raster_transform=rt,
+                        image=f"{df.attrs['collection']}/{img_id}",
+                        bands=df.attrs["bands"],
+                    )
+                )
+
+    return RequestSet(requestset=reqs)
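
For clarity, a hedged sketch feeding `table_to_requestset` a hand-built frame instead of a live `cloud_table` result. The asset ID is hypothetical, and depending on `Request` internals an initialized EE session may still be required even for the non-mosaic path:

    import ee
    import pandas as pd

    ee.Initialize()

    from cubexpress.request import table_to_requestset

    df = pd.DataFrame({
        "day": ["2023-06-01"],
        "cloudPct": [2.5],
        "images": ["20230601T153621_20230601T154201_T18LVQ"],  # hypothetical ID
    })
    df.attrs.update({
        "lon": -76.5, "lat": -9.5, "edge_size": 512, "scale": 10,
        "bands": ["B4", "B3", "B2"],
        "collection": "COPERNICUS/S2_HARMONIZED",
    })
    rs = table_to_requestset(df, mosaic=False)  # one request per asset
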
cubexpress-0.1.1.dist-info/METADATA CHANGED
@@ -1,9 +1,10 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.0
-Summary: A Python package for efficient processing of cubic earth observation (EO) data
-Home-page: https://github.com/andesdatacube/cubexpress/
+Version: 0.1.1
+Summary: Efficient processing of cubic Earth-observation (EO) data.
+Home-page: https://github.com/andesdatacube/cubexpress
 License: MIT
+Keywords: earth-engine,sentinel-2,geospatial,eo,cube
 Author: Julio Contreras
 Author-email: contrerasnetk@gmail.com
 Requires-Python: >=3.9,<4.0
@@ -13,11 +14,18 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Scientific/Engineering :: GIS
+Provides-Extra: full
+Requires-Dist: earthengine-api (>=0.1.392) ; extra == "full"
 Requires-Dist: numpy (>=1.25.2)
 Requires-Dist: pandas (>=2.0.3)
+Requires-Dist: pyarrow (>=14.0.0) ; extra == "full"
+Requires-Dist: pygeohash (>=1.2.0,<2.0.0)
+Requires-Dist: rasterio (>=1.3.9) ; extra == "full"
 Requires-Dist: utm (>=0.8.0,<0.9.0)
-Project-URL: Documentation, https://andesdatacube.github.io/cubexpress/
-Project-URL: Repository, https://github.com/andesdatacube/cubexpress/
+Project-URL: Documentation, https://andesdatacube.github.io/cubexpress
+Project-URL: Repository, https://github.com/andesdatacube/cubexpress
 Description-Content-Type: text/markdown
 
 <h1></h1>
cubexpress-0.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+cubexpress/__init__.py,sha256=ybNczt27OVUkT8WE8v0-A9hByKtfNsNysg-V8CnZqxE,564
+cubexpress/cache.py,sha256=EZiR2AJfplaLpqMIVFb5piCAgFqHKF1vgLIrutfz8tA,1425
+cubexpress/cloud_utils.py,sha256=O4qcl8kr0_Yv0giZ-h4uzf791d1_z9HZy1Br8N867iA,8102
+cubexpress/conversion.py,sha256=JSaMnswY-2n5E4H2zxb-oEOTJ8UPzXfMeSVCremtvTw,2520
+cubexpress/cube.py,sha256=1GPVAt5Q0vGqA3QJ4gixAevlosV4JHNKMzN1PirhawI,2911
+cubexpress/downloader.py,sha256=u0u1LG2DOIaEvCPDIAaJDLH5_od52D1OPCbcpCicRzY,3320
+cubexpress/geospatial.py,sha256=ZbsPIgsYQFnNFXUuQ136rJsL4b2Bf91o0Vsswby2dFc,1812
+cubexpress/geotyping.py,sha256=6hjzjZhg6jRYRhLMQ_IiBygnShWlRCtpIbf6rRaQQ7s,17163
+cubexpress/request.py,sha256=cRm0J6Um8wCkbMDYBv9eCiqv32hLH28EH4eHLLDsJ-c,2333
+cubexpress-0.1.1.dist-info/LICENSE,sha256=XjoS-d76b7Cl-VgCWhQk83tNf2dNldKBN8SrImwGc2Q,1072
+cubexpress-0.1.1.dist-info/METADATA,sha256=3DhrMNjKWIfImjQgfCVRfade1JXcX2acDJX4iPtwR4U,9692
+cubexpress-0.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+cubexpress-0.1.1.dist-info/RECORD,,
cubexpress/download.py DELETED
@@ -1,347 +0,0 @@
-import concurrent.futures
-import json
-import pathlib
-from concurrent.futures import ThreadPoolExecutor
-from copy import deepcopy
-from typing import Optional
-
-import ee
-import numpy as np
-import pandas as pd
-
-from cubexpress.geotyping import RequestSet
-
-
-def check_not_found_error(error_message: str) -> bool:
-    """
-    Checks if the error message indicates that the request exceeded Earth Engine's size limit.
-
-    Args:
-        error_message (str): The error message to check.
-
-    Returns:
-        bool: True if the message is a size-limit error, False otherwise.
-
-    Example:
-        >>> check_not_found_error("Total request size must be less than or equal to...")
-        True
-    """
-    return (
-        "Total request size" in error_message
-        and "must be less than or equal to" in error_message
-    )
-
-
-def quadsplit_manifest(manifest: dict) -> list[dict]:
-    """
-    Splits a manifest into four smaller ones by dividing the grid dimensions.
-
-    Args:
-        manifest (dict): The original manifest to split.
-
-    Returns:
-        List[dict]: A list of four smaller manifests with updated grid transformations.
-
-    Example:
-        >>> manifest = {'grid': {'dimensions': {'width': 100, 'height': 100}, 'affineTransform': {'scaleX': 0.1, 'scaleY': 0.1, 'translateX': 0, 'translateY': 0}}}
-        >>> quadsplit_manifest(manifest)
-        [{'grid': {'dimensions': {'width': 50, 'height': 50}, 'affineTransform': {'scaleX': 0.1, 'scaleY': 0.1, 'translateX': 0, 'translateY': 0}}}, {'grid': {'dimensions': {'width': 50, 'height': 50}, 'affineTransform': {'scaleX': 0.1, 'scaleY': 0.1, 'translateX': 5.0, 'translateY': 0}}}, ...]
-    """
-    manifest_copy = deepcopy(manifest)
-    new_width = manifest["grid"]["dimensions"]["width"] // 2
-    new_height = manifest["grid"]["dimensions"]["height"] // 2
-    manifest_copy["grid"]["dimensions"]["width"] = new_width
-    manifest_copy["grid"]["dimensions"]["height"] = new_height
-
-    manifests = []
-    for idx in range(4):
-        new_manifest = deepcopy(manifest_copy)
-        res_x = manifest["grid"]["affineTransform"]["scaleX"]
-        res_y = manifest["grid"]["affineTransform"]["scaleY"]
-
-        add_x, add_y = (0, 0)
-        if idx == 1:
-            add_x = new_width * res_x
-        elif idx == 2:
-            add_y = new_height * res_y
-        elif idx == 3:
-            add_x = new_width * res_x
-            add_y = new_height * res_y
-
-        new_manifest["grid"]["affineTransform"]["translateX"] += add_x
-        new_manifest["grid"]["affineTransform"]["translateY"] += add_y
-
-        manifests.append(new_manifest)
-
-    return manifests
-
-
-def getGeoTIFFbatch(
-    manifest_dict: dict,
-    full_outname: pathlib.Path,
-    max_deep_level: Optional[int] = 5,
-    method: Optional[str] = "getPixels",
-) -> Optional[np.ndarray]:
-    """
-    Downloads a GeoTIFF image from Google Earth Engine using either the `getPixels` or `computePixels` method.
-    If the requested area exceeds the size limit, the image is recursively split into smaller tiles until the
-    download succeeds or the maximum recursion depth is reached.
-
-    Args:
-        manifest_dict (dict): A dictionary containing image metadata, including grid dimensions, affine transformations,
-            and either an `assetId` or `expression` for the image source.
-        full_outname (pathlib.Path): The full path where the downloaded GeoTIFF file will be saved.
-        max_deep_level (Optional[int]): Maximum recursion depth for splitting large requests. Defaults to 5.
-        method (Optional[str]): Method for retrieving image data. Can be 'getPixels' for asset-based requests or
-            'computePixels' for expressions. Defaults to 'getPixels'.
-
-    Returns:
-        Optional[pathlib.Path]: The path to the downloaded GeoTIFF file. Returns `None` if the download fails.
-
-    Raises:
-        ValueError: If the method is not 'getPixels' or 'computePixels', or if the image cannot be found.
-
-    Example:
-        >>> import ee
-        >>> import pathlib
-        >>> ee.Initialize()
-        >>> manifest_dict = {
-        ...     "assetId": "COPERNICUS/S2_HARMONIZED/20160816T153912_20160816T154443_T18TYN",
-        ...     "fileFormat": "GEO_TIFF",
-        ...     "bandIds": ["B4", "B3", "B2"],
-        ...     "grid": {
-        ...         "dimensions": {
-        ...             "width": 512,
-        ...             "height": 512
-        ...         },
-        ...         "affineTransform": {
-        ...             "scaleX": 10,
-        ...             "shearX": 0,
-        ...             "translateX": 725260.108545126,
-        ...             "scaleY": -10,
-        ...             "shearY": 0,
-        ...             "translateY": 4701550.38712196
-        ...         },
-        ...         "crsCode": "EPSG:32618"
-        ...     }
-        ... }
-
-        >>> getGeoTIFFbatch(manifest_dict, pathlib.Path('output/sentinel_image.tif'))
-        PosixPath('output/sentinel_image.tif')
-    """
-
-    # Check if the maximum recursion depth has been reached
-    if max_deep_level == 0:
-        raise ValueError("Max recursion depth reached.")
-
-    try:
-        # Get the image bytes
-        if method == "getPixels":
-            image_bytes: bytes = ee.data.getPixels(manifest_dict)
-        elif method == "computePixels":
-            image_bytes: bytes = ee.data.computePixels(manifest_dict)
-        else:
-            raise ValueError("Method must be either 'getPixels' or 'computePixels'")
-
-        # Write the image bytes to a file
-        with open(full_outname, "wb") as src:
-            src.write(image_bytes)
-    except Exception as e:
-        # TODO: This is a workaround when the image is not found, as it is a message from the server
-        # it is not possible to check the type of the exception
-        if not check_not_found_error(str(e)):
-            raise ValueError(
-                f"Error downloading the GeoTIFF file from Earth Engine: {e}"
-            )
-
-        # Create the output directory if it doesn't exist
-        child_folder: pathlib.Path = full_outname.parent / full_outname.stem
-        pathlib.Path(child_folder).mkdir(parents=True, exist_ok=True)
-
-        # Split the manifest into four smaller manifests
-        manifest_dicts = quadsplit_manifest(manifest_dict)
-
-        for idx, manifest_dict_batch in enumerate(manifest_dicts):
-            # Recursively download the image
-            getGeoTIFFbatch(
-                full_outname=child_folder / ("%s__%02d.tif" % (full_outname.stem, idx)),
-                manifest_dict=manifest_dict_batch,
-                max_deep_level=max_deep_level - 1,
-                method=method,
-            )
-
-    return full_outname
-
-
-def getGeoTIFF(
-    manifest_dict: dict, full_outname: pathlib.Path, max_deep_level: Optional[int] = 5
-) -> Optional[np.ndarray]:
-    """
-    Retrieves an image from Earth Engine using the appropriate method based on the manifest type.
-
-    This function downloads a GeoTIFF image from Google Earth Engine (GEE). Depending on the content of
-    the provided manifest (`manifest_dict`), the function will either use the `getPixels` method (for
-    asset-based requests) or the `computePixels` method (for expressions). If the requested area exceeds
-    the size limit, the image will be recursively split into smaller tiles until the download succeeds or
-    the maximum recursion depth is reached.
-
-    Args:
-        manifest_dict (dict): A dictionary containing the image metadata. This should include either:
-            - `assetId`: The identifier of a GEE asset (e.g., satellite imagery).
-            - `expression`: A serialized string representing a GEE image expression (e.g., an image computation).
-            Additionally, the manifest should include grid information such as the image dimensions and affine transformations.
-
-        full_outname (pathlib.Path): The full path where the downloaded GeoTIFF file will be saved.
-
-        max_deep_level (Optional[int]): The maximum recursion depth for splitting large requests into smaller tiles if needed.
-            Defaults to 5.
-
-    Returns:
-        Optional[np.ndarray]: The downloaded image as a `numpy` array, or `None` if the download fails. It will
-            also return the full file path to the saved GeoTIFF image.
-
-    Raises:
-        ValueError: If the manifest does not contain either an `assetId` or `expression`, or if there is an error during download.
-
-    Example 1: Downloading an image using an `assetId`:
-        >>> import ee
-        >>> import pathlib
-        >>> ee.Initialize()
-        >>> manifest_dict = {
-        ...     "assetId": "COPERNICUS/S2_HARMONIZED/20160816T153912_20160816T154443_T18TYN",
-        ...     "fileFormat": "GEO_TIFF",
-        ...     "bandIds": ["B4", "B3", "B2"],
-        ...     "grid": {
-        ...         "dimensions": {"width": 512, "height": 512},
-        ...         "affineTransform": {
-        ...             "scaleX": 10,
-        ...             "shearX": 0,
-        ...             "translateX": 725260.108545126,
-        ...             "scaleY": -10,
-        ...             "shearY": 0,
-        ...             "translateY": 4701550.38712196
-        ...         },
-        ...         "crsCode": "EPSG:32618"
-        ...     }
-        ... }
-        >>> getGeoTIFF(manifest_dict, pathlib.Path('output/sentinel_image.tif'))
-        PosixPath('output/sentinel_image.tif')
-
-    Example 2: Downloading an image using an `expression`:
-        >>> image = ee.Image("COPERNICUS/S2_HARMONIZED/20160816T153912_20160816T154443_T18TYN") \
-        ...     .divide(10_000) \
-        ...     .select(["B4", "B3", "B2"])
-        >>> expression = image.serialize()
-        >>> manifest_dict = {
-        ...     "expression": expression,
-        ...     "fileFormat": "GEO_TIFF",
-        ...     "grid": {
-        ...         "dimensions": {"width": 512, "height": 512},
-        ...         "affineTransform": {
-        ...             "scaleX": 10,
-        ...             "shearX": 0,
-        ...             "translateX": 725260.108545126,
-        ...             "scaleY": -10,
-        ...             "shearY": 0,
-        ...             "translateY": 4701550.38712196
-        ...         },
-        ...         "crsCode": "EPSG:32618"
-        ...     }
-        ... }
-        >>> getGeoTIFF(manifest_dict, pathlib.Path('output/expression_image.tif'))
-        PosixPath('output/expression_image.tif')
-    """
-    if "assetId" in manifest_dict:
-        return getGeoTIFFbatch(
-            manifest_dict=manifest_dict,
-            full_outname=full_outname,
-            max_deep_level=max_deep_level,
-            method="getPixels",
-        )
-    elif "expression" in manifest_dict:
-        if isinstance(
-            manifest_dict["expression"], str
-        ):  # Decode only if the expression is still a string.
-            # From a string to a ee.Image object
-            manifest_dict["expression"] = ee.deserializer.decode(
-                json.loads(manifest_dict["expression"])
-            )
-
-        return getGeoTIFFbatch(
-            manifest_dict=manifest_dict,
-            full_outname=full_outname,
-            max_deep_level=max_deep_level,
-            method="computePixels",
-        )
-    else:
-        raise ValueError("Manifest does not contain 'assetId' or 'expression'")
-
-
-def getcube(
-    request: RequestSet,
-    output_path: str | pathlib.Path,
-    nworkers: Optional[int] = None,
-    max_deep_level: Optional[int] = 5,
-) -> list[pathlib.Path]:
-    """
-    Downloads multiple GeoTIFF images in parallel from Google Earth Engine (GEE) based on the provided request set.
-
-    Args:
-        request (RequestSet): A collection of image requests containing metadata and processing parameters.
-        output_path (Union[str, pathlib.Path]): Directory where the downloaded images will be saved.
-        nworkers (Optional[int], default=None): Number of parallel threads. If None, runs sequentially.
-        max_deep_level (Optional[int], default=5): Maximum recursion depth for image subdivision if exceeding GEE limits.
-
-    Returns:
-        List[pathlib.Path]: List of paths to the downloaded GeoTIFF files.
-
-    Example:
-        >>> import ee, cubexpress
-        >>> ee.Initialize()
-        >>> point = ee.Geometry.Point([-97.59, 33.37])
-        >>> collection = ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED") \
-        ...     .filterBounds(point) \
-        ...     .filterDate('2024-01-01', '2024-01-31')
-        >>> image_ids = collection.aggregate_array('system:id').getInfo()
-        >>> geotransform = cubexpress.lonlat2rt(lon=-97.59, lat=33.37, edge_size=128, scale=10)
-        >>> requests = [cubexpress.Request(id=f"s2_{i}", raster_transform=geotransform, bands=["B4", "B3", "B2"], image=ee.Image(img_id)) for i, img_id in enumerate(image_ids)]
-        >>> cube_requests = cubexpress.RequestSet(requestset=requests)
-        >>> cubexpress.getcube(request=cube_requests, nworkers=4, output_path="output", max_deep_level=5)
-        [PosixPath('output/s2_0.tif'), PosixPath('output/s2_1.tif'), ...]
-    """
-
-    # Check that _dataframe exists and is not empty
-    if request._dataframe is None or request._dataframe.empty:
-        raise ValueError(
-            "The request's _dataframe is None or empty. "
-            "There are no valid requests to process."
-        )
-
-    # **Revalidate** the DataFrame structure, in case the user manipulated it.
-    request._validate_dataframe_schema()
-
-    # Get the table
-    table: pd.DataFrame = request._dataframe
-
-    # Create the output directory if it doesn't exist
-    output_path = pathlib.Path(output_path)
-    output_path.mkdir(parents=True, exist_ok=True)
-
-    results = []
-    with ThreadPoolExecutor(max_workers=nworkers) as executor:
-        futures = {
-            executor.submit(
-                getGeoTIFF, row.manifest, output_path / row.outname, max_deep_level
-            ): row
-            for _, row in table.iterrows()
-        }
-        for future in concurrent.futures.as_completed(futures):
-            try:
-                result = future.result()
-                if result:
-                    results.append(result)
-            except Exception as e:
-                # TODO add this into the log
-                print(f"Error processing {futures[future].outname}: {e}")
-
-    return results
cubexpress-0.1.0.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-cubexpress/__init__.py,sha256=1CF6kINn70mfS5HNzYyTf4UsOUPG0qzeetoJSDk0ALw,418
-cubexpress/conversion.py,sha256=h77re8AtdVV_Jy3ugZeQ-e2I8DHSKoghiq70MXkzBaQ,2506
-cubexpress/download.py,sha256=DX5DKPdKiuv1gHxs-5Q5ScZ06nvE-Pi1YGLSzQc2jrs,14315
-cubexpress/geotyping.py,sha256=5JgsOfRfwQf-iBh902wKQ1AxEKw1HgFL2brzwkxO0Pg,17152
-cubexpress-0.1.0.dist-info/LICENSE,sha256=XjoS-d76b7Cl-VgCWhQk83tNf2dNldKBN8SrImwGc2Q,1072
-cubexpress-0.1.0.dist-info/METADATA,sha256=XfBIfpFP1quHSNr60Dn6R8EEpdq02XJWCepwhl7j7U0,9327
-cubexpress-0.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cubexpress-0.1.0.dist-info/RECORD,,