PyPI - satcube - Versions diffs - 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl - Mend

satcube 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of satcube might be problematic. Click here for more details.

Files changed (17)

satcube/__init__.py +2 -4
satcube/align.py +85 -44
satcube/archive_cloud_detection.py +23 -0
satcube/archive_dataclass.py +39 -0
satcube/archive_main.py +453 -0
satcube/archive_utils.py +1087 -0
satcube/{cloud_detection.py → cloud.py} +100 -95
satcube/composite.py +85 -0
satcube/download.py +2 -5
satcube/gapfill.py +216 -0
satcube/objects.py +208 -36
satcube/smooth.py +46 -0
{satcube-0.1.17.dist-info → satcube-0.1.18.dist-info}/METADATA +1 -1
satcube-0.1.18.dist-info/RECORD +17 -0
satcube-0.1.17.dist-info/RECORD +0 -10
{satcube-0.1.17.dist-info → satcube-0.1.18.dist-info}/LICENSE +0 -0
{satcube-0.1.17.dist-info → satcube-0.1.18.dist-info}/WHEEL +0 -0

satcube/{cloud_detection.py → cloud.py} RENAMED Viewed

@@ -24,8 +24,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 import rasterio as rio
 from rasterio.merge import merge
-import shutil
 from satcube.utils import define_iteration, DeviceManager
 import warnings
 warnings.filterwarnings(
@@ -35,8 +33,6 @@ warnings.filterwarnings(
     module=r"huggingface_hub\.utils\._.*",
 )
 def infer_cloudmask(
     input_path: str | pathlib.Path,
     output_path: str | pathlib.Path,
@@ -45,38 +41,16 @@ def infer_cloudmask(
     chunk_size: int = 512,
     overlap: int = 32,
     device: str = "cpu",
-    save_mask: bool = False,
-    prefix: str = ""
+    save_mask: bool = True
 ) -> pathlib.Path:
-    """
-    Predict 'image_path' in overlapping patches of 'chunk_size' x 'chunk_size',
-    but only write the valid (inner) region to avoid seam artifacts.
-    This uses partial overlap logic:
-      - For interior tiles, skip overlap//2 on each side.
-      - For boundary tiles, we skip only the interior side to avoid losing data at the edges.
-    Parameters
-    ----------
-    image_path : Path to input image.
-    output_path : Path to output single-band mask.
-    cloud_model : PyTorch model (already loaded with weights).
-    chunk_size : Size of each tile to read from the source image (default 512).
-    overlap : Overlap in pixels between adjacent tiles (default 32).
-    device : "cpu" or "cuda:0".
-    Returns
-    -------
-    pathlib.Path : The path to the created output image.
-    """
-    input_path = pathlib.Path(input_path)
-    output_path = pathlib.Path(output_path)
+    input_path = pathlib.Path(input_path).expanduser().resolve()
+    output_path = pathlib.Path(output_path).expanduser().resolve()
     with rio.open(input_path) as src:
         meta = src.profile
-        if not meta.get("tiled", False):
-            raise ValueError("The input image is not marked as tiled in its metadata.")
+        # if not meta.get("tiled", False):
+        #     raise ValueError("The input image is not marked as tiled in its metadata.")
         # Ensure the internal blocksize matches chunk_size
         if chunk_size % meta["blockxsize"] != 0 and meta["blockxsize"] <= chunk_size:
             raise ValueError(f"Image blocks must be {chunk_size}x{chunk_size}, "
@@ -93,9 +67,21 @@ def infer_cloudmask(
             window = Window(col_off, row_off, chunk_size, chunk_size)
             patch = src.read(window=window) / 1e4
-            patch_tensor = torch.from_numpy(patch).float().unsqueeze(0).to(device)
-            result = cloud_model(patch_tensor).cpu().numpy().astype(np.uint8)
+            patch_tensor = (
+                torch.from_numpy(patch)
+                .float()
+                .unsqueeze(0)
+                .to(device)
+            )
+            result = (
+                cloud_model(patch_tensor)
+                .cpu()
+                .numpy()
+                .astype(np.uint8)
+            )
             if col_off == 0:
                 offset_x = 0
             else:
@@ -132,8 +118,7 @@ def infer_cloudmask(
             output_mask = output_path.parent / (output_path.stem + "_mask.tif")
             with rio.open(output_mask, "w", **out_meta) as dst:
                 dst.write(full_mask, 1)
         data = src.read()
         img_prof = src.profile.copy()
@@ -144,57 +129,62 @@ def infer_cloudmask(
         with rio.open(output_path, "w", **img_prof) as dst:
             dst.write(masked)
-    return output_path
+    flat = full_mask.astype(np.uint8).ravel()
+    counts = np.bincount(flat, minlength=4)
+    total  = flat.size
+    percentages = {
+        "id": input_path.stem,
+        "clear_pct":         counts[0] / total * 100.0,
+        "thin_cloud_pct":    counts[1] / total * 100.0,
+        "cloud_shadow_pct":  counts[2] / total * 100.0,
+        "thick_cloud_pct":   counts[3] / total * 100.0,
+    }
+    return percentages
-def cloud_masking(
-    input: str | pathlib.Path = "raw",
-    output: str | pathlib.Path = "masked",
+def cloud_fn(
+    metadata: pd.DataFrame | None = None,
+    input_dir: str | pathlib.Path | None = None,
+    output_dir: str | pathlib.Path = "masked",
     model_path: str | pathlib.Path = "SEN2CloudEnsemble",
     device: str = "cpu",
-    save_mask: bool = False,
+    save_mask: bool = True,
+    cache: bool = False,
     nworks: int = 4,
-) -> list[pathlib.Path]:
-    """Write cloud-masked Sentinel-2 images.
-    Parameters
-    ----------
-    input
-        Path to a single ``.tif`` file **or** a directory containing them.
-    output
-        Destination directory (created i
-        f missing).
-    tile, pad
-        Tile size and padding (pixels) when tiling is required.
-    save_mask
-        If *True*, store the binary mask alongside the masked image.
-    device
-        Torch device for inference, e.g. ``"cpu"`` or ``"cuda:0"``.
-    max_pix_cpu
-        Tile images larger than this when running on CPU.
-    Returns
-    ------
-    list[pathlib.Path]
-        Paths to the generated masked images.
-    """
-    src = pathlib.Path(input).expanduser().resolve()
-    dst_dir = pathlib.Path(output).expanduser().resolve()
-    dst_dir.mkdir(parents=True, exist_ok=True)
-    # Collect files to process -------------------------------------------------
-    tif_paths = []
-    if src.is_dir():
-        tif_paths = [p for p in src.rglob("*.tif")]
-    elif src.is_file() and src.suffix.lower() == ".tif":
-        tif_paths = [src]
-        src = src.parent  # for relative-path bookkeeping below
-    else:
-        raise ValueError(f"Input must be a .tif or directory, got: {src}")
-    if not tif_paths:
-        print(f"[cloud_masking] No .tif files found in {src}")
-        return []
+) -> pd.DataFrame | None:
+    input_dir = pathlib.Path(input_dir).expanduser().resolve()
+    output_dir = pathlib.Path(output_dir).expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    if metadata is None:
+        if not input_dir:
+            raise ValueError("Input directory must be specified.")
+        else:
+            if input_dir.is_dir():
+                tif_paths = [p for p in input_dir.rglob("*.tif")]
+                df = pd.DataFrame({
+                    "id": [p.stem for p in tif_paths],
+                    "path": [str(p) for p in tif_paths]
+                })
+            elif input_dir.is_file() and input_dir.suffix.lower() == ".tif":
+                tif_paths = [input_dir]
+                input_dir = input_dir.parent
+            else:
+                raise ValueError(f"Input must be a .tif or directory, got: {input_dir}")
+    else:
+        if not input_dir:
+            raise ValueError("Input directory must be specified.")
+        else:
+            df = metadata["id"].to_frame()
+            df["path"] = df["id"].apply(lambda x: str(input_dir / (x + ".tif")))
+    if cache:
+        exist_files = [file.stem for file in output_dir.glob("*.tif")]
+        df = df[~df["id"].isin(exist_files)]
     if not pathlib.Path(model_path).exists():
         mlstac.download(
             file = "https://huggingface.co/tacofoundation/CloudSEN12-models/resolve/main/SEN2CloudEnsemble/mlm.json",
@@ -202,22 +192,23 @@ def cloud_masking(
         )
     model = mlstac.load(model_path)
-    cloud_model = DeviceManager(model, init_device=device).model
-    cloud_model.eval()
+    cloud_model = DeviceManager(model, init_device=device).model.eval()
+    results_cloud = []
     with ThreadPoolExecutor(max_workers=nworks) as executor:
         futures = {
             executor.submit(
                 infer_cloudmask,
-                input_path=p,
-                output_path=dst_dir / p.name,
+                input_path=p["path"],
+                output_path=output_dir / (p["id"] + ".tif"),
                 cloud_model=cloud_model,
                 device=device,
-                save_mask=save_mask,
-                prefix=f"[{i+1}/{len(tif_paths)}] "
-            ): p for i, p in enumerate(tif_paths)
+                save_mask=save_mask
+            ): p for i, p in df.iterrows()
         }
         for future in tqdm(
             as_completed(futures),
             total=len(futures),
@@ -228,11 +219,25 @@ def cloud_masking(
             p = futures[future]
             try:
                 result = future.result()
-                print(f"{result} processed successfully.")
+                results_cloud.append(result)
             except Exception as e:
                 print(f"Error processing {p}: {e}")
-    metadata = src / "metadata.csv"
-    if metadata.exists():
-        metadata_dst = dst_dir / "metadata.csv"
-        shutil.copy(metadata, metadata_dst)
+    cloud_df = pd.DataFrame(results_cloud)
+    if cloud_df.empty:
+        return metadata
+    metadata = metadata.drop(
+        columns=["clear_pct","thin_cloud_pct", "cloud_shadow_pct", "thick_cloud_pct"],
+        errors="ignore"
+    )
+    metadata = metadata.merge(
+        cloud_df,
+        on="id",
+        how="left",
+        suffixes=('', '')
+    )
+    return metadata

satcube/composite.py ADDED Viewed

@@ -0,0 +1,85 @@
+import pathlib
+from typing import Tuple
+import numpy as np
+import pandas as pd
+import rasterio as rio
+from concurrent.futures import ProcessPoolExecutor, as_completed
+def monthly_composites_s2(
+    metadata: pd.DataFrame | None = None,
+    input_dir: str | pathlib.Path | None = None,
+    output_dir: str | pathlib.Path = "monthly_composites",
+    date_range: Tuple[str, str] = ("2018-06-01", "2020-01-01"),
+    agg_method: str = "median",
+):
+    """Aggregate the rasters of each calendar month into one GeoTIFF.
+
+    For every month start between ``date_range[0]`` and ``date_range[1]`` a
+    file named ``<YYYY>-<MM>-15.tif`` is written to ``output_dir``. Months
+    without any matching row in ``metadata`` get a placeholder raster filled
+    with 65535.
+
+    Args:
+        metadata: Table with a ``date`` column. Row *i* is paired with the
+            *i*-th ``.tif`` file of ``input_dir`` (files sorted by name).
+        input_dir: Directory holding the source ``.tif`` files.
+        output_dir: Destination directory (created if missing).
+        date_range: First and last date of the period to composite.
+        agg_method: One of ``"mean"``, ``"median"``, ``"max"`` or ``"min"``.
+
+    Returns:
+        pd.DataFrame: One row per month with the columns ``outname``,
+        ``date`` and ``nodata`` (1 when the month had no source image and
+        the placeholder was written, 0 otherwise).
+
+    Raises:
+        ValueError: If ``metadata`` or ``input_dir`` is missing, if
+            ``agg_method`` is unknown, or if ``input_dir`` holds no ``.tif``.
+    """
+    if metadata is None or input_dir is None:
+        raise ValueError("Both 'metadata' and 'input_dir' must be specified.")
+    aggregators = {
+        "mean": np.mean,
+        "median": np.median,
+        "max": np.max,
+        "min": np.min,
+    }
+    # Fail before any file is written rather than at the first populated month.
+    if agg_method not in aggregators:
+        raise ValueError("Invalid aggregation method")
+    aggregate = aggregators[agg_method]
+    input_dir = pathlib.Path(input_dir).expanduser().resolve()
+    output_dir = pathlib.Path(output_dir).expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # glob() yields files in an OS-dependent order; sort so that the positional
+    # pairing with the metadata rows below is at least deterministic.
+    # NOTE(review): this still assumes the rows of `metadata` follow the same
+    # order as the sorted file names - confirm against the caller.
+    all_raw_files = sorted(f for f in input_dir.glob("*.tif") if f.is_file())
+    if not all_raw_files:
+        raise ValueError(f"No .tif files found in {input_dir}")
+    with rio.open(all_raw_files[0]) as src:
+        profile = src.profile
+    # Month key of every metadata row, computed once instead of once per month.
+    month_keys = pd.to_datetime(metadata["date"]).dt.strftime("%Y-%m-15").to_numpy()
+    months = pd.date_range(
+        pd.to_datetime(date_range[0]), pd.to_datetime(date_range[1]), freq="MS"
+    ).strftime("%Y-%m-15")
+    new_table = []
+    for date in months:
+        images = [all_raw_files[i] for i in np.flatnonzero(month_keys == date)]
+        if not images:
+            # No acquisition this month: write a raster filled with 65535.
+            data = np.full(
+                (profile["count"], profile["height"], profile["width"]),
+                65535,
+                dtype=np.uint16,
+            )
+            nodata = 1
+            profile_image = profile
+        else:
+            container = []
+            for image in images:
+                with rio.open(image) as src:
+                    container.append(src.read())
+                    # The profile of the last image read is used for writing.
+                    profile_image = src.profile
+            data = aggregate(container, axis=0)
+            nodata = 0
+        with rio.open(output_dir / f"{date}.tif", "w", **profile_image) as dst:
+            dst.write(data.astype(rio.uint16))
+        new_table.append(
+            {
+                "outname": f"{date}.tif",
+                "date": date,
+                "nodata": nodata,
+            }
+        )
+    return pd.DataFrame(new_table)

satcube/download.py CHANGED Viewed

@@ -17,7 +17,6 @@ def download(
     nworks: int = 4
 ) -> "SatCubeMetadata":
     outfolder = pathlib.Path(outfolder).resolve()
     table = ce.s2_table(
@@ -57,12 +56,10 @@ def download(
         .reset_index()
     )
-    table_final = table_req.merge(
+    df = table_req.merge(
         result_table,
         on='date',
         how='left'
     ).rename(columns={'id_x': 'id', 'id_y': 'gee_ids'})
-    table_final.to_csv(outfolder / "metadata.csv", index=False)
-    return SatCubeMetadata(df=table_final, raw_dir=outfolder)
+    return SatCubeMetadata(df=df, raw_dir=outfolder)

satcube/gapfill.py ADDED Viewed

@@ -0,0 +1,216 @@
+# satcube/gapfill.py
+from __future__ import annotations
+import pathlib, shutil
+from typing import Literal, List, Tuple
+import numpy as np
+import pandas as pd
+import rasterio as rio
+from tqdm import tqdm
+from sklearn.linear_model import LinearRegression
+_GAP_METHOD = Literal["histogram_matching", "linear"]  # values accepted by the `method` argument of _fill_one / gapfill_fn
+def linear_interpolation(
+    image1: np.ndarray, image2: np.ndarray, image3: np.ndarray
+) -> np.ndarray:
+    """Rescale ``image3`` with the linear fit that maps ``image2`` onto ``image1``.
+
+    A least-squares line ``image1 ~ slope * image2 + intercept`` is fitted on
+    the pixels that are valid (non-NaN) in *both* reference images, and that
+    line is then applied to every pixel of ``image3``.
+
+    Args:
+        image1 (np.ndarray): Reference image providing the target values.
+        image2 (np.ndarray): Reference image providing the predictor values;
+            must hold as many pixels as ``image1``.
+        image3 (np.ndarray): Image to transform with the fitted line.
+
+    Returns:
+        np.ndarray: ``slope * image3 + intercept``; NaNs in ``image3`` stay NaN.
+
+    Raises:
+        ValueError: If the two reference images differ in size.
+    """
+    target = np.ravel(image1)
+    predictor = np.ravel(image2)
+    if target.size != predictor.size:
+        raise ValueError("image1 and image2 must have the same number of pixels")
+    # Drop a pixel when it is NaN in either image. Filtering each image on its
+    # own would pair values that come from different pixel locations whenever
+    # the two NaN patterns differ.
+    valid = ~(np.isnan(target) | np.isnan(predictor))
+    # Calculate the slope and intercept
+    linreg = LinearRegression()
+    linreg.fit(predictor[valid][:, np.newaxis], target[valid][:, np.newaxis])
+    slope = linreg.coef_[0]
+    intercept = linreg.intercept_
+    # Apply the fitted line
+    image3_matched = slope * image3 + intercept
+    return image3_matched
+def tripple_histogram_matching(
+    image1: np.ndarray,
+    image2: np.ndarray,
+    image3: np.ndarray,
+    *,
+    n_bins: int = 128,
+    value_range: Tuple[float, float] = (0.0, 2.0),
+) -> np.ndarray:
+    """Map ``image3`` through the histogram match that turns ``image2`` into ``image1``.
+
+    The cumulative histograms of the two reference images give a lookup table
+    from ``image2`` values to the ``image1`` values of equal cumulative
+    frequency. That table is then applied to ``image3``.
+
+    Args:
+        image1 (np.ndarray): Reference image whose distribution is the target.
+        image2 (np.ndarray): Reference image whose distribution is the source.
+        image3 (np.ndarray): Image to transform with the lookup table.
+        n_bins (int): Number of histogram bins.
+        value_range (Tuple[float, float]): Lower and upper edge of the histograms.
+
+    Returns:
+        np.ndarray: Transformed copy of ``image3`` with the same shape; NaN
+        pixels of ``image3`` stay NaN.
+    """
+    # NaN pixels carry no information about either distribution.
+    image1_nonan = image1[~np.isnan(image1)]
+    image2_nonan = image2[~np.isnan(image2)]
+    # Both histograms share the same bin edges.
+    hist1, bins = np.histogram(image1_nonan, n_bins, value_range)
+    hist2, _ = np.histogram(image2_nonan, n_bins, value_range)
+    # Cumulative distribution functions (CDF) of the two references
+    cdf1 = hist1.cumsum() / hist1.sum()
+    cdf2 = hist2.cumsum() / hist2.sum()
+    # Lookup table: for the left edge of every image2 bin, the image1 value
+    # that sits at the same cumulative frequency.
+    left_edges = bins[:-1]
+    lut = np.interp(cdf2, cdf1, left_edges)
+    # Apply the table to image3, interpolating between bin edges
+    img3_matched = np.interp(image3.ravel(), left_edges, lut).reshape(image3.shape)
+    return img3_matched
+def _fill_one(
+    img_path: pathlib.Path,
+    ref_paths: List[pathlib.Path],
+    dates: np.ndarray,
+    this_date: np.datetime64,
+    *,
+    method: _GAP_METHOD,
+    out_dir: pathlib.Path,
+    quiet: bool
+) -> float:
+    """Gap-fill a single scene from its best temporally close neighbour.
+
+    Pixels whose raw value is 65535 are treated as missing. Candidate scenes
+    are visited from the closest to the farthest date. A candidate is skipped
+    when it is missing on a pixel that is also missing in the scene. At most
+    five candidates are evaluated, and the one with the lowest error metric
+    supplies the fill values.
+
+    Args:
+        img_path: Scene to fill.
+        ref_paths: Paths of all candidate scenes (may include ``img_path``).
+        dates: Acquisition date of every entry of ``ref_paths``.
+        this_date: Acquisition date of ``img_path``.
+        method: ``"histogram_matching"`` or ``"linear"``.
+        out_dir: Directory receiving the output file (same name as the input).
+        quiet: Suppress the progress messages when true.
+
+    Returns:
+        float: ``0.0`` when the scene had nothing to fill (file copied as is),
+        ``nan`` when no usable neighbour was found (file copied as is),
+        otherwise the error metric of the selected neighbour.
+    """
+    max_tries = 5
+    with rio.open(img_path) as src:
+        data = src.read() / 1e4
+        prof = src.profile
+    # Raw 65535 (6.5535 after scaling) marks a missing pixel.
+    data[data == 6.5535] = np.nan
+    gaps = np.isnan(data)
+    cloudmask = gaps.mean(0)
+    if cloudmask.sum() == 0:              # clean image: copy without processing
+        shutil.copy(img_path, out_dir / img_path.name)
+        return 0.0
+    if method == "histogram_matching":
+        match_band = tripple_histogram_matching
+    else:                                 # "linear"
+        match_band = linear_interpolation
+    # Visit the other scenes from the temporally closest to the farthest.
+    best_img, best_metric = None, np.inf
+    tries = 0
+    for i in np.argsort(np.abs(dates - this_date)):
+        if tries == max_tries:
+            break
+        ref_path = ref_paths[i]
+        if ref_path == img_path:
+            continue
+        with rio.open(ref_path) as src:
+            ref = src.read() / 1e4
+        ref[ref == 6.5535] = np.nan
+        ref_mask = np.isnan(ref) * 1.0
+        # Discard a reference that is missing where the scene is missing too.
+        if np.sum((ref_mask + cloudmask) == 2) != 0:
+            continue
+        # Fit the radiometric transfer only on pixels valid in both images.
+        full_mask = (cloudmask + ref_mask) > 0
+        data_masked = np.where(full_mask, np.nan, data)
+        ref_masked = np.where(full_mask, np.nan, ref)
+        filled = np.zeros_like(data)
+        for band in range(data.shape[0]):
+            filled[band] = match_band(data_masked[band], ref_masked[band], ref[band])
+        # Error metric: normalised difference of the mean of the first three bands.
+        filled_mean = filled[[2, 1, 0]].mean(0)
+        data_mean = data[[2, 1, 0]].mean(0)
+        metric = np.nanmean(np.abs(filled_mean - data_mean) / (filled_mean + data_mean))
+        if metric < best_metric:
+            best_metric = metric
+            best_img = filled
+        tries += 1
+    if best_img is None:                 # no suitable ref found
+        if not quiet:
+            print(f"{img_path.name}: no cloud‑free neighbour found – copied.")
+        shutil.copy(img_path, out_dir / img_path.name)
+        return np.nan                    # could also return 0.0
+    # Fill only the pixels that were missing in the scene itself. The mask of
+    # the last candidate tried must not be reused here: it may belong to a
+    # different reference than the selected one and would add fill values on
+    # top of valid pixels.
+    fill_values = np.where(np.isnan(best_img), 0, best_img)
+    final = np.where(gaps, fill_values, data)
+    # Clip before casting so out-of-range values saturate instead of wrapping.
+    final = np.clip(final * 1e4, 0, 65535).astype(np.uint16)
+    with rio.open(out_dir / img_path.name, "w", **prof) as dst:
+        dst.write(final)
+    if not quiet:
+        print(f"{img_path.name} gap‑filled (error={best_metric:.4f})")
+    return float(best_metric)
+def gapfill_fn(                                   # wrapper in the style of align_fn
+    metadata: pd.DataFrame,
+    input_dir: str | pathlib.Path,
+    output_dir: str | pathlib.Path = "gapfilled",
+    *,
+    method: _GAP_METHOD = "histogram_matching",
+    quiet: bool = False
+) -> pd.DataFrame:
+    """Run the gap-filling step on every scene listed in *metadata*.
+
+    Each ``id`` of *metadata* is resolved to ``<input_dir>/<id>.tif`` and
+    the result is written under the same file name in *output_dir*, which is
+    created when missing.
+
+    Returns
+    -------
+    pd.DataFrame
+        Copy of *metadata* with a (re)computed ``match_error`` column holding
+        the value returned for each scene.
+    """
+    src_dir = pathlib.Path(input_dir).expanduser().resolve()
+    dst_dir = pathlib.Path(output_dir).expanduser().resolve()
+    dst_dir.mkdir(parents=True, exist_ok=True)
+    scene_paths = [src_dir / f"{scene_id}.tif" for scene_id in metadata["id"]]
+    scene_dates = pd.to_datetime(metadata["date"]).to_numpy()
+    progress = tqdm(scene_paths, desc="Gap‑filling", unit="img")
+    match_errors: List[float] = [
+        _fill_one(
+            scene,
+            scene_paths,
+            scene_dates,
+            acquired,
+            method=method,
+            out_dir=dst_dir,
+            quiet=quiet,
+        )
+        for scene, acquired in zip(progress, scene_dates)
+    ]
+    # Drop any stale column first so the new one is appended as the last column.
+    result = metadata.drop(columns=["match_error"], errors="ignore")
+    result["match_error"] = match_errors
+    return result

satcube 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl

Potentially problematic release.

satcube 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl