cubexpress 0.1.6__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cubexpress might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cubexpress
3
- Version: 0.1.6
3
+ Version: 0.1.8
4
4
  Summary: Efficient processing of cubic Earth-observation (EO) data.
5
5
  Home-page: https://github.com/andesdatacube/cubexpress
6
6
  License: MIT
@@ -13,10 +13,6 @@ Both return a ``pandas.DataFrame`` with the columns **day**, **cloudPct** and
13
13
  from __future__ import annotations
14
14
 
15
15
  import datetime as dt
16
- import json
17
- import pathlib
18
- from typing import List, Optional
19
-
20
16
  import ee
21
17
  import pandas as pd
22
18
 
@@ -28,10 +24,8 @@ def _cloud_table_single_range(
28
24
  lon: float,
29
25
  lat: float,
30
26
  edge_size: int,
31
- scale: int,
32
27
  start: str,
33
- end: str,
34
- collection: str = "COPERNICUS/S2_HARMONIZED",
28
+ end: str
35
29
  ) -> pd.DataFrame:
36
30
  """Return raw cloud-table rows for a single *start–end* interval.
37
31
 
@@ -53,76 +47,64 @@ def _cloud_table_single_range(
53
47
  Columns: **day** (str), **cloudPct** (float), **images** (str
54
48
  concatenation of asset IDs separated by ``-``). No filtering applied.
55
49
  """
56
- roi = _square_roi(lon, lat, edge_size, scale)
57
- s2 = ee.ImageCollection(collection)
58
-
59
- if collection in (
60
- "COPERNICUS/S2_HARMONIZED",
61
- "COPERNICUS/S2_SR_HARMONIZED",
62
- ):
63
- qa_band = "cs_cdf"
64
- csp = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
65
- else:
66
- qa_band, csp = None, None
67
-
68
- def _add_props(img):
69
- day = ee.Date(img.get("system:time_start")).format("YYYY-MM-dd")
70
- imgid = img.get("system:index")
71
-
72
- if qa_band:
73
- score = (
74
- img.linkCollection(csp, [qa_band])
75
- .select([qa_band])
76
- .reduceRegion(ee.Reducer.mean(), roi, scale)
77
- .get(qa_band)
78
- )
79
- # If score is null assume completely clear (score=1 → cloudPct=0)
80
- score_safe = ee.Algorithms.If(score, score, -1)
81
- cloud_pct = (
82
- ee.Number(1)
83
- .subtract(ee.Number(score_safe))
84
- .multiply(10000)
85
- .round()
86
- .divide(100)
87
- )
88
- else:
89
- cloud_pct = ee.Number(-1)
90
-
91
- return ee.Feature(
92
- None,
93
- {
94
- "day": day,
95
- "cloudPct": cloud_pct,
96
- "images": imgid,
97
- },
98
- )
99
50
 
100
- triples = (
101
- s2.filterDate(start, end)
51
+ center = ee.Geometry.Point([lon, lat])
52
+ roi = _square_roi(lon, lat, edge_size, 10)
53
+
54
+ s2 = (
55
+ ee.ImageCollection("COPERNICUS/S2_HARMONIZED")
102
56
  .filterBounds(roi)
103
- .map(_add_props)
104
- .reduceColumns(ee.Reducer.toList(3), ["day", "cloudPct", "images"])
105
- .get("list")
106
- .getInfo()
57
+ .filterDate(start, end)
58
+ )
59
+
60
+ csp = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
61
+
62
+ ic = (
63
+ s2
64
+ .linkCollection(csp, ["cs_cdf"])
65
+ .select(["cs_cdf"])
66
+ )
67
+ ids = ic.aggregate_array("system:index").getInfo()
68
+ df_ids = pd.DataFrame({"id": ids})
69
+
70
+
71
+ region_scale = edge_size * 10 / 2
72
+
73
+
74
+ try:
75
+ raw = ic.getRegion(geometry=center, scale=region_scale).getInfo()
76
+ except ee.ee_exception.EEException as e:
77
+ if "No bands in collection" in str(e):
78
+ return pd.DataFrame(
79
+ columns=["id", "cs_cdf", "date", "high_null_flag"]
80
+ )
81
+ raise
82
+
83
+ df_raw = pd.DataFrame(raw[1:], columns=raw[0])
84
+
85
+
86
+ df = (
87
+ df_ids
88
+ .merge(df_raw, on="id", how="left")
89
+ .assign(
90
+ date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
91
+ high_null_flag=lambda d: d["cs_cdf"].isna().astype(int),
92
+ )
93
+ .drop(columns=["longitude", "latitude", "time"])
107
94
  )
108
95
 
109
- df = pd.DataFrame(triples, columns=["day", "cloudPct", "images"]).dropna()
110
- df["cloudPct"] = df["cloudPct"].astype(float)
111
- df["images"] = df["images"].astype(str)
96
+ df["cs_cdf"] = df["cs_cdf"].fillna(df.groupby("date")["cs_cdf"].transform("mean"))
97
+
112
98
  return df
113
99
 
114
100
 
115
- def cloud_table(
101
+ def s2_cloud_table(
116
102
  lon: float,
117
103
  lat: float,
118
104
  edge_size: int = 2048,
119
- scale: int = 10,
120
105
  start: str = "2017-01-01",
121
106
  end: str = "2024-12-31",
122
- cloud_max: float = 7.0,
123
- bands: Optional[List[str]] = None,
124
- collection: str = "COPERNICUS/S2_HARMONIZED",
125
- output_path: str | pathlib.Path | None = None,
107
+ cscore: float = 0.5,
126
108
  cache: bool = True,
127
109
  verbose: bool = True,
128
110
  ) -> pd.DataFrame:
@@ -161,23 +143,10 @@ def cloud_table(
161
143
  pandas.DataFrame
162
144
  Filtered cloud table with ``.attrs`` containing the call parameters.
163
145
  """
164
- if bands is None:
165
- bands = [
166
- "B1",
167
- "B2",
168
- "B3",
169
- "B4",
170
- "B5",
171
- "B6",
172
- "B7",
173
- "B8",
174
- "B8A",
175
- "B9",
176
- "B10",
177
- "B11",
178
- "B12",
179
- ]
180
146
 
147
+ bands = ["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B9", "B10", "B11", "B12"]
148
+ collection = "COPERNICUS/S2_HARMONIZED"
149
+ scale = 10
181
150
  cache_file = _cache_key(lon, lat, edge_size, scale, collection)
182
151
 
183
152
  # ─── 1. Load cached data if present ────────────────────────────────────
@@ -185,7 +154,7 @@ def cloud_table(
185
154
  if verbose:
186
155
  print("📂 Loading cached table …")
187
156
  df_cached = pd.read_parquet(cache_file)
188
- have_idx = pd.to_datetime(df_cached["day"], errors="coerce").dropna()
157
+ have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()
189
158
 
190
159
  cached_start = have_idx.min().date()
191
160
  cached_end = have_idx.max().date()
@@ -204,39 +173,40 @@ def cloud_table(
204
173
  a1, b1 = start, cached_start.isoformat()
205
174
  df_new_parts.append(
206
175
  _cloud_table_single_range(
207
- lon, lat, edge_size, scale, a1, b1, collection
176
+ lon, lat, edge_size, a1, b1
208
177
  )
209
178
  )
210
179
  if dt.date.fromisoformat(end) > cached_end:
211
180
  a2, b2 = cached_end.isoformat(), end
212
181
  df_new_parts.append(
213
182
  _cloud_table_single_range(
214
- lon, lat, edge_size, scale, a2, b2, collection
183
+ lon, lat, edge_size, a2, b2
215
184
  )
216
185
  )
217
186
  df_new = pd.concat(df_new_parts, ignore_index=True)
218
187
  df_full = (
219
188
  pd.concat([df_cached, df_new], ignore_index=True)
220
- .drop_duplicates("day")
221
- .sort_values("day", kind="mergesort")
189
+ .sort_values("date", kind="mergesort")
222
190
  )
223
191
  else:
224
- # No cache or caching disabled: fetch full range.
192
+
225
193
  if verbose:
226
194
  msg = "Generating table (no cache found)…" if cache else "Generating table…"
227
195
  print("⏳", msg)
228
196
  df_full = _cloud_table_single_range(
229
- lon, lat, edge_size, scale, start, end, collection
197
+ lon, lat, edge_size, start, end
230
198
  )
199
+
231
200
 
232
201
  # ─── 2. Save cache ─────────────────────────────────────────────────────
233
202
  if cache:
234
203
  df_full.to_parquet(cache_file, compression="zstd")
235
204
 
236
205
  # ─── 3. Filter by cloud cover and requested date window ────────────────
206
+
237
207
  result = (
238
- df_full.query("@start <= day <= @end")
239
- .query("cloudPct < @cloud_max")
208
+ df_full.query("@start <= date <= @end")
209
+ .query("cs_cdf > @cscore")
240
210
  .reset_index(drop=True)
241
211
  )
242
212
 
@@ -248,9 +218,7 @@ def cloud_table(
248
218
  "edge_size": edge_size,
249
219
  "scale": scale,
250
220
  "bands": bands,
251
- "collection": collection,
252
- "cloud_max": cloud_max,
253
- "output_path": str(output_path) if output_path else "",
221
+ "collection": collection
254
222
  }
255
223
  )
256
224
  return result
@@ -20,7 +20,11 @@ from typing import Any, Dict, List
20
20
  import ee
21
21
  import rasterio as rio
22
22
  from rasterio.io import MemoryFile
23
+ import logging
24
+ import os
23
25
 
26
+ os.environ['CPL_LOG_ERRORS'] = 'OFF'
27
+ logging.getLogger('rasterio._env').setLevel(logging.ERROR)
24
28
 
25
29
  def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None:
26
30
  """Download *ulist* and save it as *full_outname*.
@@ -29,49 +29,54 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
29
29
  If *df* is empty after filtering.
30
30
 
31
31
  """
32
- if df.empty:
32
+
33
+
34
+ df_ = df.copy()
35
+
36
+ if df_.empty:
33
37
  raise ValueError("cloud_table returned no rows; nothing to request.")
34
38
 
35
39
  rt = lonlat2rt(
36
- lon=df.attrs["lon"],
37
- lat=df.attrs["lat"],
38
- edge_size=df.attrs["edge_size"],
39
- scale=df.attrs["scale"],
40
+ lon=df_.attrs["lon"],
41
+ lat=df_.attrs["lat"],
42
+ edge_size=df_.attrs["edge_size"],
43
+ scale=df_.attrs["scale"],
40
44
  )
41
- centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
42
- reqs: List[Request] = []
45
+ centre_hash = pgh.encode(df_.attrs["lat"], df_.attrs["lon"], precision=5)
46
+ reqs: list[Request] = []
43
47
 
44
48
  if mosaic:
45
49
  # group all asset IDs per day
46
50
  grouped = (
47
- df.assign(img=lambda x: x.images.str.split("-"))
48
- .explode("img")
49
- .groupby("day")["img"]
50
- .apply(list)
51
+ df_.groupby("date")["id"] # Series con listas de ids por día
52
+ .apply(list)
51
53
  )
52
54
 
53
55
  for day, img_ids in grouped.items():
54
56
  ee_img = ee.ImageCollection(
55
- [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
57
+ [ee.Image(f"{df_.attrs['collection']}/{img}") for img in img_ids]
56
58
  ).mosaic()
59
+
57
60
  reqs.append(
58
61
  Request(
59
- id=f"{day}_{centre_hash}_mosaic",
62
+ id=f"{day}_{centre_hash}",
60
63
  raster_transform=rt,
61
64
  image=ee_img,
62
- bands=df.attrs["bands"],
65
+ bands=df_.attrs["bands"],
63
66
  )
64
67
  )
65
68
  else: # one request per asset
66
- for _, row in df.iterrows():
67
- for img_id in row["images"].split("-"):
68
- reqs.append(
69
- Request(
70
- id=f"{row['day']}_{centre_hash}_{img_id}",
71
- raster_transform=rt,
72
- image=f"{df.attrs['collection']}/{img_id}",
73
- bands=df.attrs["bands"],
74
- )
69
+ for _, row in df_.iterrows():
70
+ img_id = row["id"]
71
+ day = row["date"]
72
+
73
+ reqs.append(
74
+ Request(
75
+ id=f"{day}_{centre_hash}_{img_id}",
76
+ raster_transform=rt,
77
+ image=f"{df_.attrs['collection']}/{img_id}",
78
+ bands=df_.attrs["bands"],
75
79
  )
80
+ )
76
81
 
77
82
  return RequestSet(requestset=reqs)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cubexpress"
3
- version = "0.1.6"
3
+ version = "0.1.8"
4
4
  description = "Efficient processing of cubic Earth-observation (EO) data."
5
5
  authors = [
6
6
  "Julio Contreras <contrerasnetk@gmail.com>",
File without changes
File without changes