cubexpress 0.1.7.tar.gz → 0.1.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cubexpress has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.7
+Version: 0.1.9
 Summary: Efficient processing of cubic Earth-observation (EO) data.
 Home-page: https://github.com/andesdatacube/cubexpress
 License: MIT
@@ -1,6 +1,6 @@
 from cubexpress.conversion import lonlat2rt, geo2utm
 from cubexpress.geotyping import RasterTransform, Request, RequestSet
-from cubexpress.cloud_utils import cloud_table
+from cubexpress.cloud_utils import s2_cloud_table
 from cubexpress.cube import get_cube
 from cubexpress.request import table_to_requestset
 
@@ -15,7 +15,7 @@ __all__ = [
     "RequestSet",
     "geo2utm",
     "get_cube",
-    "cloud_table",
+    "s2_cloud_table",
     "table_to_requestset"
 ]
 
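The two __init__ hunks above rename the public cloud-table helper, so downstream imports change with it. A minimal sketch of the caller-side change (illustrative only, not part of the diff itself):

    # 0.1.7
    from cubexpress import cloud_table
    # 0.1.9
    from cubexpress import s2_cloud_table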
@@ -13,10 +13,6 @@ Both return a ``pandas.DataFrame`` with the columns **day**, **cloudPct** and
 from __future__ import annotations
 
 import datetime as dt
-import json
-import pathlib
-from typing import List, Optional
-
 import ee
 import pandas as pd
 
@@ -28,102 +24,97 @@ def _cloud_table_single_range(
     lon: float,
     lat: float,
     edge_size: int,
-    scale: int,
     start: str,
-    end: str,
-    collection: str = "COPERNICUS/S2_HARMONIZED",
+    end: str
 ) -> pd.DataFrame:
-    """Return raw cloud-table rows for a single *start–end* interval.
+    """
+    Build a daily cloud-score table for a square Sentinel-2 footprint.
 
     Parameters
     ----------
-    lon, lat
-        Centre coordinates in decimal degrees.
-    edge_size, scale
-        ROI size in pixels (*edge_size*) and pixel resolution in metres
-        (*scale*), fed into :pyfunc:`cubexpress.geospatial._square_roi`.
-    start, end
-        ISO-dates (``YYYY-MM-DD``) delimiting the query.
-    collection
-        Sentinel-2 collection name to query.
+    lon, lat : float
+        Point at the centre of the requested region (°).
+    edge_size : int
+        Side length of the square region in Sentinel-2 pixels (10 m each).
+    start, end : str
+        ISO-8601 dates delimiting the period, e.g. ``"2024-06-01"``.
 
     Returns
     -------
     pandas.DataFrame
-        Columns: **day** (str), **cloudPct** (float), **images** (str
-        concatenation of asset IDs separated by ``-``). No filtering applied.
+        One row per image with columns:
+        * ``id`` Sentinel-2 ID
+        * ``cs_cdf`` – Cloud Score Plus CDF (0–1)
+        * ``date`` – acquisition date (YYYY-MM-DD)
+        * ``high_null_flag`` – 1 if cloud score missing
+
+    Notes
+    -----
+    Missing ``cs_cdf`` values are filled with the mean of the same day.
     """
-    roi = _square_roi(lon, lat, edge_size, scale)
-    s2 = ee.ImageCollection(collection)
-
-    if collection in (
-        "COPERNICUS/S2_HARMONIZED",
-        "COPERNICUS/S2_SR_HARMONIZED",
-    ):
-        qa_band = "cs_cdf"
-        csp = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
-    else:
-        qa_band, csp = None, None
-
-    def _add_props(img):
-        day = ee.Date(img.get("system:time_start")).format("YYYY-MM-dd")
-        imgid = img.get("system:index")
-
-        if qa_band:
-            score = (
-                img.linkCollection(csp, [qa_band])
-                .select([qa_band])
-                .reduceRegion(ee.Reducer.mean(), roi, scale)
-                .get(qa_band)
-            )
-            # If score is null assume completely clear (score=1 → cloudPct=0)
-            score_safe = ee.Algorithms.If(score, score, -1)
-            cloud_pct = (
-                ee.Number(1)
-                .subtract(ee.Number(score_safe))
-                .multiply(10000)
-                .round()
-                .divide(100)
-            )
-        else:
-            cloud_pct = ee.Number(-1)
-
-        return ee.Feature(
-            None,
-            {
-                "day": day,
-                "cloudPct": cloud_pct,
-                "images": imgid,
-            },
-        )
 
-    triples = (
-        s2.filterDate(start, end)
+    center = ee.Geometry.Point([lon, lat])
+    roi = _square_roi(lon, lat, edge_size, 10)
+
+    s2 = (
+        ee.ImageCollection("COPERNICUS/S2_HARMONIZED")
         .filterBounds(roi)
-        .map(_add_props)
-        .reduceColumns(ee.Reducer.toList(3), ["day", "cloudPct", "images"])
-        .get("list")
-        .getInfo()
+        .filterDate(start, end)
+    )
+
+    csp = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
+
+    ic = (
+        s2
+        .linkCollection(csp, ["cs_cdf"])
+        .select(["cs_cdf"])
     )
 
-    df = pd.DataFrame(triples, columns=["day", "cloudPct", "images"]).dropna()
-    df["cloudPct"] = df["cloudPct"].astype(float)
-    df["images"] = df["images"].astype(str)
+    # image IDs for every expected date
+    ids = ic.aggregate_array("system:index").getInfo()
+    df_ids = pd.DataFrame({"id": ids})
+
+
+    region_scale = edge_size * 10 / 2
+
+
+    try:
+        raw = ic.getRegion(geometry=center, scale=region_scale).getInfo()
+    except ee.ee_exception.EEException as e:
+        if "No bands in collection" in str(e):
+            return pd.DataFrame(
+                columns=["id", "cs_cdf", "date", "high_null_flag"]
+            )
+        raise
+
+    df_raw = pd.DataFrame(raw[1:], columns=raw[0])
+
+
+    df = (
+        df_ids
+        .merge(df_raw, on="id", how="left")
+        .assign(
+            date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
+            high_null_flag=lambda d: d["cs_cdf"].isna().astype(int),
+        )
+        .drop(columns=["longitude", "latitude", "time"])
+    )
+
+    # fill missing scores with daily mean
+    df["cs_cdf"] = df["cs_cdf"].fillna(df.groupby("date")["cs_cdf"].transform("mean"))
+
     return df
 
 
-def cloud_table(
+def s2_cloud_table(
     lon: float,
     lat: float,
-    edge_size: int = 2048,
-    scale: int = 10,
-    start: str = "2017-01-01",
-    end: str = "2024-12-31",
-    cloud_max: float = 7.0,
-    bands: Optional[List[str]] = None,
-    collection: str = "COPERNICUS/S2_HARMONIZED",
-    output_path: str | pathlib.Path | None = None,
-    cache: bool = True,
+    edge_size: int,
+    start: str,
+    end: str,
+    max_cscore: float = 1.0,
+    min_cscore: float = 0.0,
+    cache: bool = False,
     verbose: bool = True,
 ) -> pd.DataFrame:
     """Build (and cache) a per-day cloud-table for the requested ROI.
@@ -161,23 +152,10 @@ def cloud_table(
     pandas.DataFrame
         Filtered cloud table with ``.attrs`` containing the call parameters.
     """
-    if bands is None:
-        bands = [
-            "B1",
-            "B2",
-            "B3",
-            "B4",
-            "B5",
-            "B6",
-            "B7",
-            "B8",
-            "B8A",
-            "B9",
-            "B10",
-            "B11",
-            "B12",
-        ]
 
+    bands = ["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B9", "B10", "B11", "B12"]
+    collection = "COPERNICUS/S2_HARMONIZED"
+    scale = 10
     cache_file = _cache_key(lon, lat, edge_size, scale, collection)
 
     # ─── 1. Load cached data if present ────────────────────────────────────
@@ -185,7 +163,7 @@ def cloud_table(
         if verbose:
             print("📂 Loading cached table …")
         df_cached = pd.read_parquet(cache_file)
-        have_idx = pd.to_datetime(df_cached["day"], errors="coerce").dropna()
+        have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()
 
         cached_start = have_idx.min().date()
         cached_end = have_idx.max().date()
@@ -204,39 +182,40 @@ def cloud_table(
             a1, b1 = start, cached_start.isoformat()
             df_new_parts.append(
                 _cloud_table_single_range(
-                    lon, lat, edge_size, scale, a1, b1, collection
+                    lon, lat, edge_size, a1, b1
                 )
             )
         if dt.date.fromisoformat(end) > cached_end:
             a2, b2 = cached_end.isoformat(), end
             df_new_parts.append(
                 _cloud_table_single_range(
-                    lon, lat, edge_size, scale, a2, b2, collection
+                    lon, lat, edge_size, a2, b2
                 )
             )
         df_new = pd.concat(df_new_parts, ignore_index=True)
         df_full = (
             pd.concat([df_cached, df_new], ignore_index=True)
-            .drop_duplicates("day")
-            .sort_values("day", kind="mergesort")
+            .sort_values("date", kind="mergesort")
         )
     else:
-        # No cache or caching disabled: fetch full range.
+
         if verbose:
             msg = "Generating table (no cache found)…" if cache else "Generating table…"
             print("⏳", msg)
         df_full = _cloud_table_single_range(
-            lon, lat, edge_size, scale, start, end, collection
+            lon, lat, edge_size, start, end
         )
+
 
     # ─── 2. Save cache ─────────────────────────────────────────────────────
     if cache:
         df_full.to_parquet(cache_file, compression="zstd")
 
     # ─── 3. Filter by cloud cover and requested date window ────────────────
+
     result = (
-        df_full.query("@start <= day <= @end")
-        .query("cloudPct < @cloud_max")
+        df_full.query("@start <= date <= @end")
+        .query("@min_cscore <= cs_cdf <= @max_cscore")
         .reset_index(drop=True)
     )
 
@@ -248,9 +227,7 @@ def cloud_table(
             "edge_size": edge_size,
             "scale": scale,
             "bands": bands,
-            "collection": collection,
-            "cloud_max": cloud_max,
-            "output_path": str(output_path) if output_path else "",
+            "collection": collection
         }
     )
-    return result
+    return result
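The hunks above replace cloud_table with s2_cloud_table: edge_size, start and end become required, the Sentinel-2 collection and 10 m scale are hard-coded, and filtering now uses Cloud Score Plus cs_cdf thresholds (min_cscore/max_cscore) instead of cloud_max. A hedged usage sketch against the new signature; the coordinates, date window and threshold are placeholder values, and Earth Engine is assumed to be already authenticated:

    import ee
    import cubexpress

    ee.Initialize()  # assumes EE credentials are already configured

    df = cubexpress.s2_cloud_table(
        lon=-76.94, lat=-12.04,   # placeholder centre point
        edge_size=512,            # square ROI edge in 10 m Sentinel-2 pixels
        start="2024-01-01",
        end="2024-06-30",
        min_cscore=0.8,           # keep images with cs_cdf >= 0.8 (mostly clear)
        cache=True,               # optional parquet cache; off by default in 0.1.9
    )
    # columns per the new docstring: id, cs_cdf, date, high_null_flag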
@@ -16,17 +16,22 @@ from __future__ import annotations
 import pathlib
 import concurrent.futures
 from typing import Dict, Any
-
 import ee
+
+
 from cubexpress.downloader import download_manifest, download_manifests
 from cubexpress.geospatial import quadsplit_manifest, calculate_cell_size
-from cubexpress.geotyping import RequestSet
+from cubexpress.request import table_to_requestset
+import pandas as pd
 
 
 def get_geotiff(
     manifest: Dict[str, Any],
     full_outname: pathlib.Path | str,
+    join: bool = True,
+    eraser: bool = True,
     nworks: int = 4,
+    verbose: bool = True,
 ) -> None:
     """Download *manifest* to *full_outname*, retrying with tiled requests.
 
@@ -39,19 +44,26 @@ def get_geotiff(
     nworks
         Maximum worker threads when the image must be split; default **4**.
     """
+    full_outname = pathlib.Path(full_outname)
     try:
-        download_manifest(manifest, pathlib.Path(full_outname))
+        download_manifest(manifest, full_outname)
     except ee.ee_exception.EEException as err:
-        # Handle EE “too many pixels” error by recursive tiling.
+
         size = manifest["grid"]["dimensions"]["width"] # square images assumed
         cell_w, cell_h, power = calculate_cell_size(str(err), size)
         tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
-        download_manifests(tiled, max_workers=nworks, full_outname=pathlib.Path(full_outname))
+        download_manifests(tiled, full_outname, join, eraser, nworks)
+
+    if verbose:
+        print(f"Downloaded {full_outname}")
 
 
 def get_cube(
-    requests: RequestSet,
+    table: pd.DataFrame,
     outfolder: pathlib.Path | str,
+    join: bool = True,
+    eraser: bool = True,
+    mosaic: bool = True,
     nworks: int = 4,
 ) -> None:
     """Download every request in *requests* to *outfolder* using a thread pool.
@@ -68,14 +80,18 @@ def get_cube(
     nworks
         Pool size for concurrent downloads; default **4**.
     """
-    out = pathlib.Path(outfolder)
+
+    requests = table_to_requestset(
+        table=table,
+        mosaic=mosaic
+    )
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=nworks) as pool:
         futures = []
         for _, row in requests._dataframe.iterrows():
-            outname = out / f"{row.id}.tif"
+            outname = pathlib.Path(outfolder) / f"{row.id}.tif"
             outname.parent.mkdir(parents=True, exist_ok=True)
-            futures.append(pool.submit(get_geotiff, row.manifest, outname, nworks))
+            futures.append(pool.submit(get_geotiff, row.manifest, outname, join, eraser, nworks))
 
         for fut in concurrent.futures.as_completed(futures):
             try:
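get_cube now takes the cloud-table DataFrame directly and builds the RequestSet internally via table_to_requestset; mosaic controls how the table is turned into requests, while join and eraser are forwarded to the tile downloader. A sketch continuing the example above (the output folder name is arbitrary):

    cubexpress.get_cube(
        table=df,             # DataFrame returned by s2_cloud_table
        outfolder="s2_cube",  # placeholder output directory
        mosaic=True,          # one mosaicked request per acquisition date
        nworks=4,             # download threads
    )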
@@ -21,7 +21,10 @@ import ee
 import rasterio as rio
 from rasterio.io import MemoryFile
 import logging
+from rasterio.merge import merge
+from rasterio.enums import Resampling
 import os
+import shutil
 
 os.environ['CPL_LOG_ERRORS'] = 'OFF'
 logging.getLogger('rasterio._env').setLevel(logging.ERROR)
@@ -53,7 +56,7 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
         blockxsize=256,
         blockysize=256,
         compress="ZSTD",
-        zstd_level=13,
+        # zstd_level=13,
         predictor=2,
         num_threads=20,
         nodata=65535,
@@ -65,13 +68,12 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
     with rio.open(full_outname, "w", **profile) as dst:
         dst.write(src.read())
 
-    print(f"{full_outname} downloaded successfully.") # noqa: T201
-
-
 def download_manifests(
     manifests: List[Dict[str, Any]],
-    max_workers: int,
     full_outname: pathlib.Path,
+    join: bool = True,
+    eraser: bool = True,
+    max_workers: int = 4,
 ) -> None:
     """Download every manifest in *manifests* concurrently.
 
@@ -93,3 +95,41 @@
                 fut.result()
             except Exception as exc: # noqa: BLE001
                 print(f"Error in one of the downloads: {exc}") # noqa: T201
+
+    if join:
+
+        dir_path = full_outname.parent / full_outname.stem
+        input_files = sorted(dir_path.glob("*.tif"))
+
+        if dir_path.exists() and len(input_files) > 1:
+
+            with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
+                srcs = [rio.open(fp) for fp in input_files]
+                mosaic, out_transform = merge(
+                    srcs,
+                    nodata=65535,
+                    resampling=Resampling.nearest
+                )
+
+                meta = srcs[0].profile.copy()
+                meta["transform"] = out_transform
+                meta.update(
+                    height=mosaic.shape[1],
+                    width=mosaic.shape[2]
+                )
+
+                with rio.open(full_outname, "w", **meta) as dst:
+                    dst.write(mosaic)
+
+                for src in srcs:
+                    src.close()
+
+            if eraser:
+                # Remove the per-tile folder
+                shutil.rmtree(dir_path)
+
+            print("✅ Mosaic generated:", full_outname)
+            return full_outname
+
+        else:
+            return full_outname
@@ -482,7 +482,7 @@ class RequestSet(BaseModel):
            str: A string representation of the entire RasterTransformSet.
        """
        num_entries = len(self.requestset)
-       return f"RasterTransformSet({num_entries} entries)"
+       return f"RequestSet({num_entries} entries)"
 
    def __str__(self):
        return super().__repr__()
@@ -5,13 +5,15 @@ from __future__ import annotations
 import ee
 import pandas as pd
 import pygeohash as pgh
-from typing import List
 
 from cubexpress.geotyping import Request, RequestSet
 from cubexpress.conversion import lonlat2rt
 
 
-def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
+def table_to_requestset(
+    table: pd.DataFrame,
+    mosaic: bool = True
+) -> RequestSet:
     """Return a :class:`RequestSet` built from *df* (cloud_table result).
 
     Parameters
@@ -29,6 +31,10 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
         If *df* is empty after filtering.
 
     """
+
+
+    df = table.copy()
+
     if df.empty:
         raise ValueError("cloud_table returned no rows; nothing to request.")
 
@@ -39,24 +45,23 @@ def table_to_requestset(df: pd.DataFrame, *, mosaic: bool = True) -> RequestSet:
         scale=df.attrs["scale"],
     )
     centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
-    reqs: List[Request] = []
+    reqs: list[Request] = []
 
     if mosaic:
         # group all asset IDs per day
         grouped = (
-            df.assign(img=lambda x: x.images.str.split("-"))
-            .explode("img")
-            .groupby("day")["img"]
-            .apply(list)
+            df.groupby("date")["id"]  # Series of ID lists per day
+            .apply(list)
         )
 
         for day, img_ids in grouped.items():
             ee_img = ee.ImageCollection(
                 [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
             ).mosaic()
+
             reqs.append(
                 Request(
-                    id=f"{day}_{centre_hash}_mosaic",
+                    id=f"{day}_{centre_hash}",
                     raster_transform=rt,
                     image=ee_img,
                     bands=df.attrs["bands"],
64
69
  )
65
70
  else: # one request per asset
66
71
  for _, row in df.iterrows():
67
- for img_id in row["images"].split("-"):
68
- reqs.append(
69
- Request(
70
- id=f"{row['day']}_{centre_hash}_{img_id}",
71
- raster_transform=rt,
72
- image=f"{df.attrs['collection']}/{img_id}",
73
- bands=df.attrs["bands"],
74
- )
72
+ img_id = row["id"]
73
+ day = row["date"]
74
+
75
+ reqs.append(
76
+ Request(
77
+ id=f"{day}_{centre_hash}_{img_id}",
78
+ raster_transform=rt,
79
+ image=f"{df.attrs['collection']}/{img_id}",
80
+ bands=df.attrs["bands"],
75
81
  )
82
+ )
76
83
 
77
84
  return RequestSet(requestset=reqs)
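table_to_requestset now reads the id and date columns produced by s2_cloud_table (instead of day/images) and still relies on the .attrs metadata attached to the table. A sketch of calling it directly rather than through get_cube, assuming df keeps its .attrs:

    from cubexpress import table_to_requestset

    requests = table_to_requestset(table=df, mosaic=False)  # one request per Sentinel-2 asset
    print(requests)  # repr now reads e.g. "RequestSet(42 entries)" per the geotyping fix above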
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cubexpress"
-version = "0.1.7"
+version = "0.1.9"
 description = "Efficient processing of cubic Earth-observation (EO) data."
 authors = [
     "Julio Contreras <contrerasnetk@gmail.com>",