PyPI - cubexpress - Versions diffs - 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

cubexpress 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

cubexpress/cloud_utils.py +19 -12
cubexpress/cube.py +24 -5
cubexpress/downloader.py +37 -37
cubexpress/geotyping.py +5 -15
cubexpress/request.py +42 -19
{cubexpress-0.1.9.dist-info → cubexpress-0.1.10.dist-info}/METADATA +2 -1
cubexpress-0.1.10.dist-info/RECORD +13 -0
cubexpress-0.1.9.dist-info/RECORD +0 -13
{cubexpress-0.1.9.dist-info → cubexpress-0.1.10.dist-info}/LICENSE +0 -0
{cubexpress-0.1.9.dist-info → cubexpress-0.1.10.dist-info}/WHEEL +0 -0

cubexpress/cloud_utils.py CHANGED Viewed

@@ -46,7 +46,7 @@ def _cloud_table_single_range(
         * ``id`` – Sentinel-2 ID
         * ``cs_cdf`` – Cloud Score Plus CDF (0–1)
         * ``date`` – acquisition date (YYYY-MM-DD)
-        * ``high_null_flag`` – 1 if cloud score missing
+        * ``null_flag`` – 1 if cloud score missing
     Notes
     -----
@@ -83,7 +83,7 @@ def _cloud_table_single_range(
     except ee.ee_exception.EEException as e:
         if "No bands in collection" in str(e):
             return pd.DataFrame(
-                columns=["id", "cs_cdf", "date", "high_null_flag"]
+                columns=["id", "cs_cdf", "date", "null_flag"]
             )
         raise
@@ -95,7 +95,7 @@ def _cloud_table_single_range(
         .merge(df_raw, on="id", how="left")
         .assign(
             date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
-            high_null_flag=lambda d: d["cs_cdf"].isna().astype(int),
+            null_flag=lambda d: d["cs_cdf"].isna().astype(int),
         )
         .drop(columns=["longitude", "latitude", "time"])
     )
@@ -161,7 +161,7 @@ def s2_cloud_table(
     # ─── 1. Load cached data if present ────────────────────────────────────
     if cache and cache_file.exists():
         if verbose:
-            print("📂  Loading cached table …")
+            print("📂  Loading cached metadata …")
         df_cached = pd.read_parquet(cache_file)
         have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()
@@ -173,7 +173,7 @@ def s2_cloud_table(
             and dt.date.fromisoformat(end) <= cached_end
         ):
             if verbose:
-                print("✅  Served entirely from cache.")
+                print("✅  Served entirely from metadata.")
             df_full = df_cached
         else:
             # Identify missing segments and fetch only those.
@@ -192,15 +192,21 @@ def s2_cloud_table(
                         lon, lat, edge_size, a2, b2
                     )
                 )
-            df_new = pd.concat(df_new_parts, ignore_index=True)
-            df_full = (
-                pd.concat([df_cached, df_new], ignore_index=True)
-                .sort_values("date", kind="mergesort")
-            )
+            df_new_parts = [df for df in df_new_parts if not df.empty]
+            if df_new_parts:
+                df_new = pd.concat(df_new_parts, ignore_index=True)
+                df_full = (
+                    pd.concat([df_cached, df_new], ignore_index=True)
+                    .sort_values("date", kind="mergesort")
+                )
+            else:
+                df_full = df_cached
     else:
         if verbose:
-            msg = "Generating table (no cache found)…" if cache else "Generating table…"
+            msg = "Generating metadata (no cache found)…" if cache else "Generating metadata…"
             print("⏳", msg)
         df_full = _cloud_table_single_range(
             lon, lat, edge_size, start, end
@@ -230,4 +236,5 @@ def s2_cloud_table(
             "collection": collection
         }
     )
-    return result
+    return result

cubexpress/cube.py CHANGED Viewed

@@ -29,7 +29,6 @@ def get_geotiff(
     manifest: Dict[str, Any],
     full_outname: pathlib.Path | str,
     join: bool = True,
-    eraser: bool = True,
     nworks: int = 4,
     verbose: bool = True,
 ) -> None:
@@ -52,7 +51,7 @@ def get_geotiff(
         size = manifest["grid"]["dimensions"]["width"]  # square images assumed
         cell_w, cell_h, power = calculate_cell_size(str(err), size)
         tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
-        download_manifests(tiled, full_outname, join, eraser, nworks)
+        download_manifests(tiled, full_outname, join, nworks)
     if verbose:
         print(f"Downloaded {full_outname}")
@@ -61,10 +60,11 @@ def get_geotiff(
 def get_cube(
     table: pd.DataFrame,
     outfolder: pathlib.Path | str,
-    join: bool = True,
-    eraser: bool = True,
     mosaic: bool = True,
+    join: bool = True,
     nworks: int = 4,
+    verbose: bool = True,
+    cache: bool = True
 ) -> None:
     """Download every request in *requests* to *outfolder* using a thread pool.
@@ -85,16 +85,35 @@ def get_cube(
         table=table,
         mosaic=mosaic
     )
+    outfolder = pathlib.Path(outfolder).expanduser().resolve()
     with concurrent.futures.ThreadPoolExecutor(max_workers=nworks) as pool:
         futures = []
         for _, row in requests._dataframe.iterrows():
             outname = pathlib.Path(outfolder) / f"{row.id}.tif"
+            if outname.exists() and cache:
+                continue
             outname.parent.mkdir(parents=True, exist_ok=True)
-            futures.append(pool.submit(get_geotiff, row.manifest, outname, join, eraser, nworks))
+            futures.append(
+                pool.submit(
+                    get_geotiff,
+                    row.manifest,
+                    outname,
+                    join,
+                    nworks,
+                    verbose
+                )
+            )
         for fut in concurrent.futures.as_completed(futures):
             try:
                 fut.result()
             except Exception as exc:  # noqa: BLE001 – log and keep going
                 print(f"Download error: {exc}")
+    download_df = requests._dataframe[["outname", "cs_cdf", "date"]].copy()
+    download_df["outname"] = outfolder / requests._dataframe["outname"]
+    download_df.rename(columns={"outname": "full_outname"}, inplace=True)
+    return download_df

cubexpress/downloader.py CHANGED Viewed

@@ -25,6 +25,7 @@ from rasterio.merge import merge
 from rasterio.enums import Resampling
 import os
 import shutil
+import tempfile
 os.environ['CPL_LOG_ERRORS'] = 'OFF'
 logging.getLogger('rasterio._env').setLevel(logging.ERROR)
@@ -53,7 +54,7 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
                 driver="GTiff",
                 tiled=True,
                 interleave="band",
-                blockxsize=256,
+                blockxsize=256, # TODO: Creo que es 128 (por de la superresolucion)
                 blockysize=256,
                 compress="ZSTD",
                 # zstd_level=13,
@@ -69,10 +70,9 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
                 dst.write(src.read())
 def download_manifests(
-    manifests: List[Dict[str, Any]],
+    manifests: list[Dict[str, Any]],
     full_outname: pathlib.Path,
     join: bool = True,
-    eraser: bool = True,
     max_workers: int = 4,
 ) -> None:
     """Download every manifest in *manifests* concurrently.
@@ -81,6 +81,12 @@ def download_manifests(
     ``full_outname.parent/full_outname.stem`` with names ``000000.tif``,
     ``000001.tif`` … according to the list order.
     """
+    # full_outname = pathlib.Path("/home/contreras/Documents/GitHub/cubexpress/cubexpress_test/2017-08-19_6mfrw_18LVN.tif")
+    original_dir = full_outname.parent
+    if join:
+        tmp_dir = pathlib.Path(tempfile.mkdtemp(prefix="s2tmp_"))
+        full_outname = tmp_dir / full_outname.name
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []
@@ -95,41 +101,35 @@ def download_manifests(
                 fut.result()
             except Exception as exc:  # noqa: BLE001
                 print(f"Error en una de las descargas: {exc}")  # noqa: T201
-    if join:
-        dir_path = full_outname.parent / full_outname.stem
-        input_files = sorted(dir_path.glob("*.tif"))
-        if dir_path.exists() and len(input_files) > 1:
-            with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
-                srcs = [rio.open(fp) for fp in input_files]
-                mosaic, out_transform = merge(
-                    srcs,
-                    nodata=65535,
-                    resampling=Resampling.nearest
-                )
-                meta = srcs[0].profile.copy()
-                meta["transform"] = out_transform
-                meta.update(
-                    height=mosaic.shape[1],
-                    width=mosaic.shape[2]
-                )
+    dir_path = full_outname.parent / full_outname.stem
+    input_files = sorted(dir_path.glob("*.tif"))
-                with rio.open(full_outname, "w", **meta) as dst:
-                    dst.write(mosaic)
+    if dir_path.exists() and len(input_files) > 1:
-                for src in srcs:
-                    src.close()
-            if eraser:
-                # Delete a folder with pathlib
-                shutil.rmtree(dir_path)
-            print("✅ Mosaico generado:", full_outname)
-            return full_outname
+        with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
+            srcs = [rio.open(fp) for fp in input_files]
+            mosaic, out_transform = merge(
+                srcs,
+                nodata=65535,
+                resampling=Resampling.nearest
+            )
-        else:
-            return full_outname
+            meta = srcs[0].profile.copy()
+            meta["transform"] = out_transform
+            meta.update(
+                height=mosaic.shape[1],
+                width=mosaic.shape[2]
+            )
+            outname = original_dir / full_outname.name
+            outname.parent.mkdir(parents=True, exist_ok=True)
+            with rio.open(outname, "w", **meta) as dst:
+                dst.write(mosaic)
+            for src in srcs:
+                src.close()
+        # Delete a folder with pathlib
+        shutil.rmtree(dir_path)
+    else:
+        return outname

cubexpress/geotyping.py CHANGED Viewed

@@ -306,13 +306,17 @@ class RequestSet(BaseModel):
                             "crsCode": meta.raster_transform.crs,
                         },
                     },
+                    "cs_cdf": int(meta.id.split("_")[-1]) / 100,
+                    "date": meta.id.split("_")[0],
                     "outname": f"{meta.id}.tif",
                 }
                 for index, meta in enumerate(self.requestset)
             ]
         )
     def _validate_dataframe_schema(self) -> None:
         """
         Checks that the `_dataframe` contains the required columns and that each column
@@ -367,21 +371,7 @@ class RequestSet(BaseModel):
                             f"Column '{col_name}' has an invalid type in row {i}. "
                             f"Expected {expected_type}, got {type(value)}"
                         )
-        # B) Validation of the `manifest` column structure
-        #    - Must contain at least 'assetId' or 'expression'
-        #    - Must contain 'grid' with the minimum required sub-keys
-        #    - Example:
-        #         {
-        #           "fileFormat": "GEO_TIFF",
-        #           "bandIds": [...],
-        #           "grid": {
-        #              "dimensions": {"width": ..., "height": ...},
-        #              "affineTransform": {...},
-        #              "crsCode": ...
-        #           },
-        #           // Either "assetId" or "expression" must be here
-        #         }
         for i, row in self._dataframe.iterrows():
             manifest = row["manifest"]

cubexpress/request.py CHANGED Viewed

@@ -32,7 +32,7 @@ def table_to_requestset(
     """
     df = table.copy()
     if df.empty:
@@ -47,34 +47,57 @@ def table_to_requestset(
     centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
     reqs: list[Request] = []
     if mosaic:
-        # group all asset IDs per day
         grouped = (
-            df.groupby("date")["id"]   # Series con listas de ids por día
-            .apply(list)
+        df.groupby('date')
+            .agg(
+                id_list      = ('id', list),
+                cs_cdf_mean  = ('cs_cdf', lambda x: int(round(x.mean(), 2) * 100))
+            )
         )
-        for day, img_ids in grouped.items():
-            ee_img = ee.ImageCollection(
-                [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
-            ).mosaic()
-            reqs.append(
-                Request(
-                    id=f"{day}_{centre_hash}",
-                    raster_transform=rt,
-                    image=ee_img,
-                    bands=df.attrs["bands"],
+        for day, row in grouped.iterrows():
+            img_ids   = row["id_list"]
+            cdf  = row["cs_cdf_mean"]
+            if len(img_ids) > 1:
+                ee_img = ee.ImageCollection(
+                    [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
+                ).mosaic()
+                reqs.append(
+                    Request(
+                        id=f"{day}_{centre_hash}_{cdf}",
+                        raster_transform=rt,
+                        image=ee_img,
+                        bands=df.attrs["bands"],
+                    )
                 )
-            )
-    else:  # one request per asset
+            else:
+                for img_id in img_ids:
+                    tile = img_id.split("_")[-1][1:]
+                    reqs.append(
+                        Request(
+                            id=f"{day}_{centre_hash}_{tile}_{cdf}",
+                            raster_transform=rt,
+                            image=f"{df.attrs['collection']}/{img_id}",
+                            bands=df.attrs["bands"],
+                        )
+                    )
+    else:
         for _, row in df.iterrows():
             img_id = row["id"]
-            day    = row["date"]
+            tile = img_id.split("_")[-1][1:]
+            day = row["date"]
+            cdf = int(round(row["cs_cdf"], 2) * 100)
             reqs.append(
                 Request(
-                    id=f"{day}_{centre_hash}_{img_id}",
+                    id=f"{day}_{centre_hash}_{tile}_{cdf}",
                     raster_transform=rt,
                     image=f"{df.attrs['collection']}/{img_id}",
                     bands=df.attrs["bands"],

{cubexpress-0.1.9.dist-info → cubexpress-0.1.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.9
+Version: 0.1.10
 Summary: Efficient processing of cubic Earth-observation (EO) data.
 Home-page: https://github.com/andesdatacube/cubexpress
 License: MIT
@@ -20,6 +20,7 @@ Requires-Dist: earthengine-api (>=1.5.12)
 Requires-Dist: numpy (>=2.0.2)
 Requires-Dist: pandas (>=2.2.2)
 Requires-Dist: pyarrow (>=14.0.0)
+Requires-Dist: pydantic (>=2.11.4)
 Requires-Dist: pygeohash (>=1.2.0)
 Requires-Dist: pyproj (>=3.6.0)
 Requires-Dist: rasterio (>=1.3.9)

cubexpress-0.1.10.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+cubexpress/__init__.py,sha256=RjyAqwiD0rU_Z5tCJTYNGKXZ1ggpfPB51wzhr0KwweY,570
+cubexpress/cache.py,sha256=EZiR2AJfplaLpqMIVFb5piCAgFqHKF1vgLIrutfz8tA,1425
+cubexpress/cloud_utils.py,sha256=Vr2A1SZDKP_2xNiLYgwmWOUX8P8I-pXQrxBETiUDq60,7441
+cubexpress/conversion.py,sha256=JSaMnswY-2n5E4H2zxb-oEOTJ8UPzXfMeSVCremtvTw,2520
+cubexpress/cube.py,sha256=tU0lqhtQUwEiz33yebYIbw-a0R4zmTAei-b_xqMIcWU,3719
+cubexpress/downloader.py,sha256=gHVNCNTwK9qA5MPaEHB_m0wOPprw010qaTVnszwbuUk,4668
+cubexpress/geospatial.py,sha256=ZbsPIgsYQFnNFXUuQ136rJsL4b2Bf91o0Vsswby2dFc,1812
+cubexpress/geotyping.py,sha256=XoSXQuoq5CfzKndM2Pko5KXIP0vxGNm02LOOMbCWkrs,16692
+cubexpress/request.py,sha256=jy5K9MQEurNlwhF0izFmoIh3o7m9bC97fsTT_7C7Gv0,3051
+cubexpress-0.1.10.dist-info/LICENSE,sha256=XjoS-d76b7Cl-VgCWhQk83tNf2dNldKBN8SrImwGc2Q,1072
+cubexpress-0.1.10.dist-info/METADATA,sha256=Tn_XBaLWbO4xbmzYJCM6vnbwRNZ1d1ABZ6uF4G4REYM,9664
+cubexpress-0.1.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+cubexpress-0.1.10.dist-info/RECORD,,

cubexpress-0.1.9.dist-info/RECORD DELETED Viewed

@@ -1,13 +0,0 @@
-cubexpress/__init__.py,sha256=RjyAqwiD0rU_Z5tCJTYNGKXZ1ggpfPB51wzhr0KwweY,570
-cubexpress/cache.py,sha256=EZiR2AJfplaLpqMIVFb5piCAgFqHKF1vgLIrutfz8tA,1425
-cubexpress/cloud_utils.py,sha256=aamTm-PxbPQ4ARwd5faG1a1sjKegbtkd0LxT7wYZJ60,7238
-cubexpress/conversion.py,sha256=JSaMnswY-2n5E4H2zxb-oEOTJ8UPzXfMeSVCremtvTw,2520
-cubexpress/cube.py,sha256=fwD_UdH0oBWSK-2-fMPPm3YKxcw1xxnm2g0vrZuChI8,3172
-cubexpress/downloader.py,sha256=NoJXxCZ7SXBMzUDcXU6DGa2vce61g716FYYfq17pH0k,4461
-cubexpress/geospatial.py,sha256=ZbsPIgsYQFnNFXUuQ136rJsL4b2Bf91o0Vsswby2dFc,1812
-cubexpress/geotyping.py,sha256=XuBcJAgNxvXCCIDmWijI70p6dEFlu6UfbqwQlWXSWQw,17155
-cubexpress/request.py,sha256=ZWVIXo0_rVkX1fBWREbtvvdYUSZPCv4LIcPdrMKKuLs,2270
-cubexpress-0.1.9.dist-info/LICENSE,sha256=XjoS-d76b7Cl-VgCWhQk83tNf2dNldKBN8SrImwGc2Q,1072
-cubexpress-0.1.9.dist-info/METADATA,sha256=qplHASBXni3m6kOAFIw8Jy2fBFqY1QfLDaNM3ou6cMk,9628
-cubexpress-0.1.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cubexpress-0.1.9.dist-info/RECORD,,

{cubexpress-0.1.9.dist-info → cubexpress-0.1.10.dist-info}/LICENSE RENAMED Viewed

File without changes

{cubexpress-0.1.9.dist-info → cubexpress-0.1.10.dist-info}/WHEEL RENAMED Viewed

File without changes

cubexpress 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl

cubexpress 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl