cubexpress 0.1.7.tar.gz → 0.1.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cubexpress has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.7
+Version: 0.1.9
 Summary: Efficient processing of cubic Earth-observation (EO) data.
 Home-page: https://github.com/andesdatacube/cubexpress
 License: MIT
@@ -1,6 +1,6 @@
 from cubexpress.conversion import lonlat2rt, geo2utm
 from cubexpress.geotyping import RasterTransform, Request, RequestSet
-from cubexpress.cloud_utils import cloud_table
+from cubexpress.cloud_utils import s2_cloud_table
 from cubexpress.cube import get_cube
 from cubexpress.request import table_to_requestset
 
@@ -15,7 +15,7 @@ __all__ = [
     "RequestSet",
     "geo2utm",
     "get_cube",
-    "cloud_table",
+    "s2_cloud_table",
     "table_to_requestset"
 ]
 
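The two __init__ hunks above rename the public cloud-table helper, so downstream imports change with it. A minimal sketch of the caller-side change (illustrative only, not part of the diff itself):

    # 0.1.7
    from cubexpress import cloud_table
    # 0.1.9
    from cubexpress import s2_cloud_table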
@@ -13,10 +13,6 @@ Both return a ``pandas.DataFrame`` with the columns **day**, **cloudPct** and
 from __future__ import annotations
 
 import datetime as dt
-import json
-import pathlib
-from typing import List, Optional
-
 import ee
 import pandas as pd
 
@@ -28,102 +24,97 @@ def _cloud_table_single_range(
     lon: float,
     lat: float,
     edge_size: int,
-    scale: int,
     start: str,
-    end: str,
-    collection: str = "COPERNICUS/S2_HARMONIZED",
+    end: str
 ) -> pd.DataFrame:
-    """Return raw cloud-table rows for a single *start–end* interval.
+    """
+    Build a daily cloud-score table for a square Sentinel-2 footprint.
 
     Parameters
     ----------
-    lon, lat
-        Centre coordinates in decimal degrees.
-    edge_size, scale
-        ROI size in pixels (*edge_size*) and pixel resolution in metres
-        (*scale*), fed into :pyfunc:`cubexpress.geospatial._square_roi`.
-    start, end
-        ISO-dates (``YYYY-MM-DD``) delimiting the query.
-    collection
-        Sentinel-2 collection name to query.
+    lon, lat : float
+        Point at the centre of the requested region (°).
+    edge_size : int
+        Side length of the square region in Sentinel-2 pixels (10 m each).
+    start, end : str
+        ISO-8601 dates delimiting the period, e.g. ``"2024-06-01"``.
 
     Returns
     -------
     pandas.DataFrame
-        Columns: **day** (str), **cloudPct** (float), **images** (str
-        concatenation of asset IDs separated by ``-``). No filtering applied.
+        One row per image with columns:
+        * ``id`` Sentinel-2 ID
+        * ``cs_cdf`` – Cloud Score Plus CDF (0–1)
+        * ``date`` – acquisition date (YYYY-MM-DD)
+        * ``high_null_flag`` – 1 if cloud score missing
+
+    Notes
+    -----
+    Missing ``cs_cdf`` values are filled with the mean of the same day.
     """
-    roi = _square_roi(lon, lat, edge_size, scale)
-    s2 = ee.ImageCollection(collection)
-
-    if collection in (
-        "COPERNICUS/S2_HARMONIZED",
-        "COPERNICUS/S2_SR_HARMONIZED",
-    ):
-        qa_band = "cs_cdf"
-        csp = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
-    else:
-        qa_band, csp = None, None
-
-    def _add_props(img):
-        day = ee.Date(img.get("system:time_start")).format("YYYY-MM-dd")
-        imgid = img.get("system:index")
-
-        if qa_band:
-            score = (
-                img.linkCollection(csp, [qa_band])
-                .select([qa_band])
-                .reduceRegion(ee.Reducer.mean(), roi, scale)
-                .get(qa_band)
-            )
-            # If score is null assume completely clear (score=1 → cloudPct=0)
-            score_safe = ee.Algorithms.If(score, score, -1)
-            cloud_pct = (
-                ee.Number(1)
-                .subtract(ee.Number(score_safe))
-                .multiply(10000)
-                .round()
-                .divide(100)
-            )
-        else:
-            cloud_pct = ee.Number(-1)
-
-        return ee.Feature(
-            None,
-            {
-                "day": day,
-                "cloudPct": cloud_pct,
-                "images": imgid,
-            },
-        )
 
-    triples = (
-        s2.filterDate(start, end)
+    center = ee.Geometry.Point([lon, lat])
+    roi = _square_roi(lon, lat, edge_size, 10)
+
+    s2 = (
+        ee.ImageCollection("COPERNICUS/S2_HARMONIZED")
         .filterBounds(roi)
-        .map(_add_props)
-        .reduceColumns(ee.Reducer.toList(3), ["day", "cloudPct", "images"])
-        .get("list")
-        .getInfo()
+        .filterDate(start, end)
+    )
+
+    csp = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
+
+    ic = (
+        s2
+        .linkCollection(csp, ["cs_cdf"])
+        .select(["cs_cdf"])
     )
 
-    df = pd.DataFrame(triples, columns=["day", "cloudPct", "images"]).dropna()
-    df["cloudPct"] = df["cloudPct"].astype(float)
-    df["images"] = df["images"].astype(str)
+    # image IDs for every expected date
+    ids = ic.aggregate_array("system:index").getInfo()
+    df_ids = pd.DataFrame({"id": ids})
+
+
+    region_scale = edge_size * 10 / 2
+
+
+    try:
+        raw = ic.getRegion(geometry=center, scale=region_scale).getInfo()
+    except ee.ee_exception.EEException as e:
+        if "No bands in collection" in str(e):
+            return pd.DataFrame(
+                columns=["id", "cs_cdf", "date", "high_null_flag"]
+            )
+        raise
+
+    df_raw = pd.DataFrame(raw[1:], columns=raw[0])
+
+
+    df = (
+        df_ids
+        .merge(df_raw, on="id", how="left")
+        .assign(
+            date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
+            high_null_flag=lambda d: d["cs_cdf"].isna().astype(int),
+        )
+        .drop(columns=["longitude", "latitude", "time"])
+    )
+
+    # fill missing scores with daily mean
+    df["cs_cdf"] = df["cs_cdf"].fillna(df.groupby("date")["cs_cdf"].transform("mean"))
+
     return df
 
 
-def cloud_table(
+def s2_cloud_table(
     lon: float,
     lat: float,
-    edge_size: int = 2048,
-    scale: int = 10,
-    start: str = "2017-01-01",
-    end: str = "2024-12-31",
-    cloud_max: float = 7.0,
-    bands: Optional[List[str]] = None,
-    collection: str = "COPERNICUS/S2_HARMONIZED",
-    output_path: str | pathlib.Path | None = None,
-    cache: bool = True,
+    edge_size: int,
+    start: str,
+    end: str,
+    max_cscore: float = 1.0,
+    min_cscore: float = 0.0,
+    cache: bool = False,
     verbose: bool = True,
 ) -> pd.DataFrame:
     """Build (and cache) a per-day cloud-table for the requested ROI.
@@ -161,23 +152,10 @@ def cloud_table(
     pandas.DataFrame
         Filtered cloud table with ``.attrs`` containing the call parameters.
     """
-    if bands is None:
-        bands = [
-            "B1",
-            "B2",
-            "B3",
-            "B4",
-            "B5",
-            "B6",
-            "B7",
-            "B8",
-            "B8A",
-            "B9",
-            "B10",
-            "B11",
-            "B12",
-        ]
 
+    bands = ["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B9", "B10", "B11", "B12"]
+    collection = "COPERNICUS/S2_HARMONIZED"
+    scale = 10
     cache_file = _cache_key(lon, lat, edge_size, scale, collection)
 
     # ─── 1. Load cached data if present ────────────────────────────────────
@@ -185,7 +163,7 @@ def cloud_table(
         if verbose:
             print("📂 Loading cached table …")
         df_cached = pd.read_parquet(cache_file)
-        have_idx = pd.to_datetime(df_cached["day"], errors="coerce").dropna()
+        have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()
 
         cached_start = have_idx.min().date()
         cached_end = have_idx.max().date()
@@ -204,39 +182,40 @@ def cloud_table(
             a1, b1 = start, cached_start.isoformat()
             df_new_parts.append(
                 _cloud_table_single_range(
-                    lon, lat, edge_size, scale, a1, b1, collection
+                    lon, lat, edge_size, a1, b1
                 )
             )
         if dt.date.fromisoformat(end) > cached_end:
             a2, b2 = cached_end.isoformat(), end
             df_new_parts.append(
                 _cloud_table_single_range(
-                    lon, lat, edge_size, scale, a2, b2, collection
+                    lon, lat, edge_size, a2, b2
                 )
             )
         df_new = pd.concat(df_new_parts, ignore_index=True)
         df_full = (
             pd.concat([df_cached, df_new], ignore_index=True)
-            .drop_duplicates("day")
-            .sort_values("day", kind="mergesort")
+            .sort_values("date", kind="mergesort")
         )
     else:
-        # No cache or caching disabled: fetch full range.
+
         if verbose:
             msg = "Generating table (no cache found)…" if cache else "Generating table…"
             print("⏳", msg)
         df_full = _cloud_table_single_range(
-            lon, lat, edge_size, scale, start, end, collection
+            lon, lat, edge_size, start, end
         )
+
 
     # ─── 2. Save cache ─────────────────────────────────────────────────────
     if cache:
         df_full.to_parquet(cache_file, compression="zstd")
 
     # ─── 3. Filter by cloud cover and requested date window ────────────────
+
     result = (
-        df_full.query("@start <= day <= @end")
-        .query("cloudPct < @cloud_max")
+        df_full.query("@start <= date <= @end")
+        .query("@min_cscore <= cs_cdf <= @max_cscore")
         .reset_index(drop=True)
     )
 
@@ -248,9 +227,7 @@ def cloud_table(
             "edge_size": edge_size,
             "scale": scale,
             "bands": bands,
-            "collection": collection,
-            "cloud_max": cloud_max,
-            "output_path": str(output_path) if output_path else "",
+            "collection": collection
         }
     )
-    return result
+    return result
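The hunks above replace cloud_table with s2_cloud_table: edge_size, start and end become required, the Sentinel-2 collection and 10 m scale are hard-coded, and filtering now uses Cloud Score Plus cs_cdf thresholds (min_cscore/max_cscore) instead of cloud_max. A hedged usage sketch against the new signature; the coordinates, date window and threshold are placeholder values, and Earth Engine is assumed to be already authenticated:

    import ee
    import cubexpress

    ee.Initialize()  # assumes EE credentials are already configured

    df = cubexpress.s2_cloud_table(
        lon=-76.94, lat=-12.04,   # placeholder centre point
        edge_size=512,            # square ROI edge in 10 m Sentinel-2 pixels
        start="2024-01-01",
        end="2024-06-30",
        min_cscore=0.8,           # keep images with cs_cdf >= 0.8 (mostly clear)
        cache=True,               # optional parquet cache; off by default in 0.1.9
    )
    # columns per the new docstring: id, cs_cdf, date, high_null_flag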
@@ -16,17 +16,22 @@ from __future__ import annotations
 import pathlib
 import concurrent.futures
 from typing import Dict, Any
-
 import ee
+
+
 from cubexpress.downloader import download_manifest, download_manifests
 from cubexpress.geospatial import quadsplit_manifest, calculate_cell_size
-from cubexpress.geotyping import RequestSet
+from cubexpress.request import table_to_requestset
+import pandas as pd
 
 
 def get_geotiff(
     manifest: Dict[str, Any],
     full_outname: pathlib.Path | str,
+    join: bool = True,
+    eraser: bool = True,
     nworks: int = 4,
+    verbose: bool = True,
 ) -> None:
     """Download *manifest* to *full_outname*, retrying with tiled requests.
 
@@ -39,19 +44,26 @@ def get_geotiff(
     nworks
         Maximum worker threads when the image must be split; default **4**.
     """
+    full_outname = pathlib.Path(full_outname)
     try:
-        download_manifest(manifest, pathlib.Path(full_outname))
+        download_manifest(manifest, full_outname)
     except ee.ee_exception.EEException as err:
-        # Handle EE “too many pixels” error by recursive tiling.
+
         size = manifest["grid"]["dimensions"]["width"] # square images assumed
         cell_w, cell_h, power = calculate_cell_size(str(err), size)
         tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
-        download_manifests(tiled, max_workers=nworks, full_outname=pathlib.Path(full_outname))
+        download_manifests(tiled, full_outname, join, eraser, nworks)
+
+    if verbose:
+        print(f"Downloaded {full_outname}")
 
 
 def get_cube(
-    requests: RequestSet,
+    table: pd.DataFrame,
     outfolder: pathlib.Path | str,
+    join: bool = True,
+    eraser: bool = True,
+    mosaic: bool = True,
     nworks: int = 4,
 ) -> None:
     """Download every request in *requests* to *outfolder* using a thread pool.
@@ -68,14 +80,18 @@ def get_cube(
     nworks
         Pool size for concurrent downloads; default **4**.
     """
-    out = pathlib.Path(outfolder)
+
+    requests = table_to_requestset(
+        table=table,
+        mosaic=mosaic
+    )
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=nworks) as pool:
         futures = []
         for _, row in requests._dataframe.iterrows():
-            outname = out / f"{row.id}.tif"
+            outname = pathlib.Path(outfolder) / f"{row.id}.tif"
             outname.parent.mkdir(parents=True, exist_ok=True)
-            futures.append(pool.submit(get_geotiff, row.manifest, outname, nworks))
+            futures.append(pool.submit(get_geotiff, row.manifest, outname, join, eraser, nworks))
 
         for fut in concurrent.futures.as_completed(futures):
             try:
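get_cube now takes the cloud-table DataFrame directly and builds the RequestSet internally via table_to_requestset; mosaic controls how the table is turned into requests, while join and eraser are forwarded to the tile downloader. A sketch continuing the example above (the output folder name is arbitrary):

    cubexpress.get_cube(
        table=df,             # DataFrame returned by s2_cloud_table
        outfolder="s2_cube",  # placeholder output directory
        mosaic=True,          # one mosaicked request per acquisition date
        nworks=4,             # download threads
    )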
@@ -21,7 +21,10 @@ import ee
 import rasterio as rio
 from rasterio.io import MemoryFile
 import logging
+from rasterio.merge import merge
+from rasterio.enums import Resampling
 import os
+import shutil
 
 os.environ['CPL_LOG_ERRORS'] = 'OFF'
 logging.getLogger('rasterio._env').setLevel(logging.ERROR)
@@ -53,7 +56,7 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
         blockxsize=256,
         blockysize=256,
         compress="ZSTD",
-        zstd_level=13,
+        # zstd_level=13,
         predictor=2,
         num_threads=20,
         nodata=65535,
@@ -65,13 +68,12 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
     with rio.open(full_outname, "w", **profile) as dst:
         dst.write(src.read())
 
-    print(f"{full_outname} downloaded successfully.") # noqa: T201
-
-
 def download_manifests(
     manifests: List[Dict[str, Any]],
-    max_workers: int,
     full_outname: pathlib.Path,
+    join: bool = True,
+    eraser: bool = True,
+    max_workers: int = 4,
 ) -> None:
     """Download every manifest in *manifests* concurrently.
 
@@ -93,3 +95,41 @@
                 fut.result()
             except Exception as exc: # noqa: BLE001
                 print(f"Error in one of the downloads: {exc}") # noqa: T201
+
+    if join:
+
+        dir_path = full_outname.parent / full_outname.stem
+        input_files = sorted(dir_path.glob("*.tif"))
+
+        if dir_path.exists() and len(input_files) > 1:
+
+            with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
+                srcs = [rio.open(fp) for fp in input_files]
+                mosaic, out_transform = merge(
+                    srcs,
+                    nodata=65535,
+                    resampling=Resampling.nearest
+                )
+
+                meta = srcs[0].profile.copy()
+                meta["transform"] = out_transform
+                meta.update(
+                    height=mosaic.shape[1],
+                    width=mosaic.shape[2]
+                )
+
+                with rio.open(full_outname, "w", **meta) as dst:
+                    dst.write(mosaic)
+
+                for src in srcs:
+                    src.close()
+
+            if eraser:
+                # Remove the per-tile folder
+                shutil.rmtree(dir_path)
+
+            print("✅ Mosaic generated:", full_outname)
+            return full_outname
+
+        else:
+            return full_outname
@@ -482,7 +482,7 @@ class RequestSet(BaseModel):
            str: A string representation of the entire RasterTransformSet.
        """
        num_entries = len(self.requestset)
-       return f"RasterTransformSet({num_entries} entries)"
+       return f"RequestSet({num_entries} entries)"
 
    def __str__(self):
        return super().__repr__()
@@ -5,13 +5,15 @@ from __future__ import annotations
 import ee
 import pandas as pd
 import pygeohash as pgh
-from typing import List
 
 from cubexpress.geotyping import Request, RequestSet
 from cubexpress.conversion import lonlat2rt
 
 
-def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
+def table_to_requestset(
+    table: pd.DataFrame,
+    mosaic: bool = True
+) -> RequestSet:
     """Return a :class:`RequestSet` built from *df* (cloud_table result).
 
     Parameters
@@ -29,6 +31,10 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
         If *df* is empty after filtering.
 
     """
+
+
+    df = table.copy()
+
     if df.empty:
         raise ValueError("cloud_table returned no rows; nothing to request.")
 
@@ -39,24 +45,23 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
         scale=df.attrs["scale"],
     )
     centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
-    reqs: List[Request] = []
+    reqs: list[Request] = []
 
     if mosaic:
         # group all asset IDs per day
         grouped = (
-            df.assign(img=lambda x: x.images.str.split("-"))
-            .explode("img")
-            .groupby("day")["img"]
-            .apply(list)
+            df.groupby("date")["id"]  # Series of ID lists per day
+            .apply(list)
         )
 
         for day, img_ids in grouped.items():
             ee_img = ee.ImageCollection(
                 [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
             ).mosaic()
+
             reqs.append(
                 Request(
-                    id=f"{day}_{centre_hash}_mosaic",
+                    id=f"{day}_{centre_hash}",
                     raster_transform=rt,
                     image=ee_img,
                     bands=df.attrs["bands"],
64
69
  )
65
70
  else: # one request per asset
66
71
  for _, row in df.iterrows():
67
- for img_id in row["images"].split("-"):
68
- reqs.append(
69
- Request(
70
- id=f"{row['day']}_{centre_hash}_{img_id}",
71
- raster_transform=rt,
72
- image=f"{df.attrs['collection']}/{img_id}",
73
- bands=df.attrs["bands"],
74
- )
72
+ img_id = row["id"]
73
+ day = row["date"]
74
+
75
+ reqs.append(
76
+ Request(
77
+ id=f"{day}_{centre_hash}_{img_id}",
78
+ raster_transform=rt,
79
+ image=f"{df.attrs['collection']}/{img_id}",
80
+ bands=df.attrs["bands"],
75
81
  )
82
+ )
76
83
 
77
84
  return RequestSet(requestset=reqs)
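table_to_requestset now reads the id and date columns produced by s2_cloud_table (instead of day/images) and still relies on the .attrs metadata attached to the table. A sketch of calling it directly rather than through get_cube, assuming df keeps its .attrs:

    from cubexpress import table_to_requestset

    requests = table_to_requestset(table=df, mosaic=False)  # one request per Sentinel-2 asset
    print(requests)  # repr now reads e.g. "RequestSet(42 entries)" per the geotyping fix above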
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cubexpress"
-version = "0.1.7"
+version = "0.1.9"
 description = "Efficient processing of cubic Earth-observation (EO) data."
 authors = [
     "Julio Contreras <contrerasnetk@gmail.com>",