cubexpress 0.1.8.tar.gz → 0.1.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.8
+Version: 0.1.9
 Summary: Efficient processing of cubic Earth-observation (EO) data.
 Home-page: https://github.com/andesdatacube/cubexpress
 License: MIT

cubexpress/__init__.py

@@ -1,6 +1,6 @@
 from cubexpress.conversion import lonlat2rt, geo2utm
 from cubexpress.geotyping import RasterTransform, Request, RequestSet
-from cubexpress.cloud_utils import cloud_table
+from cubexpress.cloud_utils import s2_cloud_table
 from cubexpress.cube import get_cube
 from cubexpress.request import table_to_requestset
 
@@ -15,7 +15,7 @@ __all__ = [
     "RequestSet",
     "geo2utm",
     "get_cube",
-    "cloud_table",
+    "s2_cloud_table",
     "table_to_requestset"
 ]
 
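
The public entry point changes name in 0.1.9, so downstream imports need a one-line update. A minimal before/after sketch, assuming nothing else about the re-exports changed:

    # cubexpress 0.1.8
    from cubexpress import cloud_table

    # cubexpress 0.1.9
    from cubexpress import s2_cloud_table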

cubexpress/cloud_utils.py

@@ -27,25 +27,30 @@ def _cloud_table_single_range(
     start: str,
     end: str
 ) -> pd.DataFrame:
-    """Return raw cloud-table rows for a single *start–end* interval.
+    """
+    Build a daily cloud-score table for a square Sentinel-2 footprint.
 
     Parameters
     ----------
-    lon, lat
-        Centre coordinates in decimal degrees.
-    edge_size, scale
-        ROI size in pixels (*edge_size*) and pixel resolution in metres
-        (*scale*), fed into :pyfunc:`cubexpress.geospatial._square_roi`.
-    start, end
-        ISO-dates (``YYYY-MM-DD``) delimiting the query.
-    collection
-        Sentinel-2 collection name to query.
+    lon, lat : float
+        Point at the centre of the requested region (°).
+    edge_size : int
+        Side length of the square region in Sentinel-2 pixels (10 m each).
+    start, end : str
+        ISO-8601 dates delimiting the period, e.g. ``"2024-06-01"``.
 
     Returns
     -------
     pandas.DataFrame
-        Columns: **day** (str), **cloudPct** (float), **images** (str
-        concatenation of asset IDs separated by ``-``). No filtering applied.
+        One row per image with columns:
+        * ``id`` Sentinel-2 ID
+        * ``cs_cdf`` – Cloud Score Plus CDF (0–1)
+        * ``date`` – acquisition date (YYYY-MM-DD)
+        * ``high_null_flag`` – 1 if cloud score missing
+
+    Notes
+    -----
+    Missing ``cs_cdf`` values are filled with the mean of the same day.
     """
 
     center = ee.Geometry.Point([lon, lat])
@@ -64,6 +69,8 @@ def _cloud_table_single_range(
         .linkCollection(csp, ["cs_cdf"])
         .select(["cs_cdf"])
     )
+
+    # image IDs for every expected date
     ids = ic.aggregate_array("system:index").getInfo()
     df_ids = pd.DataFrame({"id": ids})
 
@@ -93,6 +100,7 @@ def _cloud_table_single_range(
         .drop(columns=["longitude", "latitude", "time"])
     )
 
+    # fill missing scores with daily mean
     df["cs_cdf"] = df["cs_cdf"].fillna(df.groupby("date")["cs_cdf"].transform("mean"))
 
     return df
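
The added comment documents the gap-filling rule: a missing cs_cdf is replaced by the mean score of the other images acquired on the same date. A standalone pandas sketch of that pattern, on toy data rather than package output:

    import pandas as pd

    df = pd.DataFrame({
        "date":   ["2024-06-01", "2024-06-01", "2024-06-02"],
        "cs_cdf": [0.9, None, 0.4],
    })
    # fill each missing score with the mean of its own date group
    df["cs_cdf"] = df["cs_cdf"].fillna(df.groupby("date")["cs_cdf"].transform("mean"))
    print(df)  # the missing 2024-06-01 value becomes 0.9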

@@ -101,11 +109,12 @@ def _cloud_table_single_range(
 def s2_cloud_table(
     lon: float,
     lat: float,
-    edge_size: int = 2048,
-    start: str = "2017-01-01",
-    end: str = "2024-12-31",
-    cscore: float = 0.5,
-    cache: bool = True,
+    edge_size: int,
+    start: str,
+    end: str,
+    max_cscore: float = 1.0,
+    min_cscore: float = 0.0,
+    cache: bool = False,
     verbose: bool = True,
 ) -> pd.DataFrame:
     """Build (and cache) a per-day cloud-table for the requested ROI.
@@ -206,7 +215,7 @@ def s2_cloud_table(
 
     result = (
         df_full.query("@start <= date <= @end")
-        .query("cs_cdf > @cscore")
+        .query("@min_cscore <= cs_cdf <= @max_cscore")
         .reset_index(drop=True)
     )
 
@@ -221,4 +230,4 @@ def s2_cloud_table(
             "collection": collection
         }
     )
-    return result
+    return result
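
In 0.1.9 the edge_size, start and end arguments lose their defaults, and the single cscore threshold becomes a [min_cscore, max_cscore] band. A rough usage sketch with placeholder coordinates and dates, assuming Earth Engine is already authenticated:

    import ee
    import cubexpress

    ee.Initialize()

    df = cubexpress.s2_cloud_table(
        lon=-76.9,            # placeholder site
        lat=-12.0,
        edge_size=512,        # now required (0.1.8 defaulted to 2048)
        start="2024-01-01",   # now required
        end="2024-06-30",
        min_cscore=0.6,       # keep images at least this clear...
        max_cscore=1.0,       # ...and no higher than this Cloud Score Plus CDF
    )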

cubexpress/cube.py

@@ -16,17 +16,22 @@ from __future__ import annotations
 import pathlib
 import concurrent.futures
 from typing import Dict, Any
-
 import ee
+
+
 from cubexpress.downloader import download_manifest, download_manifests
 from cubexpress.geospatial import quadsplit_manifest, calculate_cell_size
-from cubexpress.geotyping import RequestSet
+from cubexpress.request import table_to_requestset
+import pandas as pd
 
 
 def get_geotiff(
     manifest: Dict[str, Any],
     full_outname: pathlib.Path | str,
+    join: bool = True,
+    eraser: bool = True,
     nworks: int = 4,
+    verbose: bool = True,
 ) -> None:
     """Download *manifest* to *full_outname*, retrying with tiled requests.
 
@@ -39,19 +44,26 @@ def get_geotiff(
     nworks
         Maximum worker threads when the image must be split; default **4**.
     """
+    full_outname = pathlib.Path(full_outname)
     try:
-        download_manifest(manifest, pathlib.Path(full_outname))
+        download_manifest(manifest, full_outname)
     except ee.ee_exception.EEException as err:
-        # Handle EE “too many pixels” error by recursive tiling.
+
        size = manifest["grid"]["dimensions"]["width"]  # square images assumed
        cell_w, cell_h, power = calculate_cell_size(str(err), size)
        tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
-        download_manifests(tiled, max_workers=nworks, full_outname=pathlib.Path(full_outname))
+        download_manifests(tiled, full_outname, join, eraser, nworks)
+
+    if verbose:
+        print(f"Downloaded {full_outname}")
 
 
 def get_cube(
-    requests: RequestSet,
+    table: pd.DataFrame,
     outfolder: pathlib.Path | str,
+    join: bool = True,
+    eraser: bool = True,
+    mosaic: bool = True,
     nworks: int = 4,
 ) -> None:
     """Download every request in *requests* to *outfolder* using a thread pool.
@@ -68,14 +80,18 @@ def get_cube(
     nworks
         Pool size for concurrent downloads; default **4**.
     """
-    out = pathlib.Path(outfolder)
+
+    requests = table_to_requestset(
+        table=table,
+        mosaic=mosaic
+    )
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=nworks) as pool:
         futures = []
         for _, row in requests._dataframe.iterrows():
-            outname = out / f"{row.id}.tif"
+            outname = pathlib.Path(outfolder) / f"{row.id}.tif"
             outname.parent.mkdir(parents=True, exist_ok=True)
-            futures.append(pool.submit(get_geotiff, row.manifest, outname, nworks))
+            futures.append(pool.submit(get_geotiff, row.manifest, outname, join, eraser, nworks))
 
         for fut in concurrent.futures.as_completed(futures):
             try:
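
get_cube no longer takes a prebuilt RequestSet; it now accepts the cloud table and calls table_to_requestset itself. A hedged continuation of the sketch above, with a placeholder output folder:

    cubexpress.get_cube(
        table=df,          # output of s2_cloud_table above
        outfolder="cube",  # one GeoTIFF per request is written here
        mosaic=True,       # one mosaicked image per acquisition date
        join=True,         # merge tiled fallback downloads into a single file
        eraser=True,       # remove the temporary tile folder after merging
        nworks=4,
    )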

cubexpress/downloader.py

@@ -21,7 +21,10 @@ import ee
 import rasterio as rio
 from rasterio.io import MemoryFile
 import logging
+from rasterio.merge import merge
+from rasterio.enums import Resampling
 import os
+import shutil
 
 os.environ['CPL_LOG_ERRORS'] = 'OFF'
 logging.getLogger('rasterio._env').setLevel(logging.ERROR)
@@ -53,7 +56,7 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
         blockxsize=256,
         blockysize=256,
         compress="ZSTD",
-        zstd_level=13,
+        # zstd_level=13,
         predictor=2,
         num_threads=20,
         nodata=65535,
@@ -65,13 +68,12 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
         with rio.open(full_outname, "w", **profile) as dst:
             dst.write(src.read())
 
-    print(f"{full_outname} downloaded successfully.")  # noqa: T201
-
-
 def download_manifests(
     manifests: List[Dict[str, Any]],
-    max_workers: int,
     full_outname: pathlib.Path,
+    join: bool = True,
+    eraser: bool = True,
+    max_workers: int = 4,
 ) -> None:
     """Download every manifest in *manifests* concurrently.
 
@@ -93,3 +95,41 @@ def download_manifests(
                 fut.result()
             except Exception as exc:  # noqa: BLE001
                 print(f"Error en una de las descargas: {exc}")  # noqa: T201
+
+    if join:
+
+        dir_path = full_outname.parent / full_outname.stem
+        input_files = sorted(dir_path.glob("*.tif"))
+
+        if dir_path.exists() and len(input_files) > 1:
+
+            with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
+                srcs = [rio.open(fp) for fp in input_files]
+                mosaic, out_transform = merge(
+                    srcs,
+                    nodata=65535,
+                    resampling=Resampling.nearest
+                )
+
+                meta = srcs[0].profile.copy()
+                meta["transform"] = out_transform
+                meta.update(
+                    height=mosaic.shape[1],
+                    width=mosaic.shape[2]
+                )
+
+                with rio.open(full_outname, "w", **meta) as dst:
+                    dst.write(mosaic)
+
+                for src in srcs:
+                    src.close()
+
+            if eraser:
+                # Delete a folder with pathlib
+                shutil.rmtree(dir_path)
+
+            print("✅ Mosaico generado:", full_outname)
+            return full_outname
+
+    else:
+        return full_outname
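
The new join step assumes the tiled fallback downloads for <name>.tif are written into a sibling folder <name>/; it merges every *.tif found there into one GeoTIFF and, with eraser enabled, removes the folder. A self-contained sketch of that rasterio merge pattern, with placeholder paths:

    import pathlib
    import shutil
    import rasterio as rio
    from rasterio.merge import merge

    out = pathlib.Path("scene.tif")                         # final mosaic
    tiles = sorted((out.parent / out.stem).glob("*.tif"))   # tiles in ./scene/

    srcs = [rio.open(fp) for fp in tiles]
    mosaic, transform = merge(srcs, nodata=65535)           # merge tiles into one array

    meta = srcs[0].profile.copy()
    meta.update(transform=transform, height=mosaic.shape[1], width=mosaic.shape[2])
    with rio.open(out, "w", **meta) as dst:
        dst.write(mosaic)

    for src in srcs:
        src.close()
    shutil.rmtree(out.parent / out.stem)                    # what eraser=True does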

cubexpress/geotyping.py

@@ -482,7 +482,7 @@ class RequestSet(BaseModel):
             str: A string representation of the entire RasterTransformSet.
         """
         num_entries = len(self.requestset)
-        return f"RasterTransformSet({num_entries} entries)"
+        return f"RequestSet({num_entries} entries)"
 
     def __str__(self):
         return super().__repr__()

cubexpress/request.py

@@ -5,13 +5,15 @@ from __future__ import annotations
 import ee
 import pandas as pd
 import pygeohash as pgh
-from typing import List
 
 from cubexpress.geotyping import Request, RequestSet
 from cubexpress.conversion import lonlat2rt
 
 
-def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
+def table_to_requestset(
+    table: pd.DataFrame,
+    mosaic: bool = True
+) -> RequestSet:
     """Return a :class:`RequestSet` built from *df* (cloud_table result).
 
     Parameters
@@ -31,30 +33,30 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
     """
 
 
-    df_ = df.copy()
+    df = table.copy()
 
-    if df_.empty:
+    if df.empty:
         raise ValueError("cloud_table returned no rows; nothing to request.")
 
     rt = lonlat2rt(
-        lon=df_.attrs["lon"],
-        lat=df_.attrs["lat"],
-        edge_size=df_.attrs["edge_size"],
-        scale=df_.attrs["scale"],
+        lon=df.attrs["lon"],
+        lat=df.attrs["lat"],
+        edge_size=df.attrs["edge_size"],
+        scale=df.attrs["scale"],
     )
-    centre_hash = pgh.encode(df_.attrs["lat"], df_.attrs["lon"], precision=5)
+    centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
     reqs: list[Request] = []
 
     if mosaic:
         # group all asset IDs per day
         grouped = (
-            df_.groupby("date")["id"]  # Series con listas de ids por día
+            df.groupby("date")["id"]  # Series con listas de ids por día
             .apply(list)
         )
 
         for day, img_ids in grouped.items():
             ee_img = ee.ImageCollection(
-                [ee.Image(f"{df_.attrs['collection']}/{img}") for img in img_ids]
+                [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
             ).mosaic()
 
             reqs.append(
@@ -62,11 +64,11 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
                     id=f"{day}_{centre_hash}",
                     raster_transform=rt,
                     image=ee_img,
-                    bands=df_.attrs["bands"],
+                    bands=df.attrs["bands"],
                 )
             )
     else:  # one request per asset
-        for _, row in df_.iterrows():
+        for _, row in df.iterrows():
            img_id = row["id"]
            day = row["date"]
 
@@ -74,8 +76,8 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
                Request(
                    id=f"{day}_{centre_hash}_{img_id}",
                    raster_transform=rt,
-                    image=f"{df_.attrs['collection']}/{img_id}",
-                    bands=df_.attrs["bands"],
+                    image=f"{df.attrs['collection']}/{img_id}",
+                    bands=df.attrs["bands"],
                )
            )
 
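
table_to_requestset now takes the DataFrame as an ordinary first argument named table (the keyword-only df form is gone), with mosaic still available. A short sketch reusing the df built in the s2_cloud_table example above:

    from cubexpress import table_to_requestset

    requests = table_to_requestset(df, mosaic=False)  # one request per Sentinel-2 asset
    print(requests)  # repr now reads e.g. "RequestSet(42 entries)"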

pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cubexpress"
-version = "0.1.8"
+version = "0.1.9"
 description = "Efficient processing of cubic Earth-observation (EO) data."
 authors = [
     "Julio Contreras <contrerasnetk@gmail.com>",

2 files without changes.