satcube 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of satcube might be problematic. Click here for more details.

satcube/__init__.py CHANGED
@@ -1,10 +1,9 @@
1
1
  from satcube.cloud_detection import cloud_masking
2
- from satcube.download import download_data
3
-
4
-
5
-
6
- __all__ = ["cloud_masking", "download_data"]
7
-
2
+ from satcube.download import download
3
+ from satcube.align import align
8
4
  import importlib.metadata
9
- __version__ = importlib.metadata.version("satcube")
5
+ from satcube.objects import SatCubeMetadata
6
+
7
+ __all__ = ["cloud_masking", "download", "align", "SatCubeMetadata"]
8
+ # __version__ = importlib.metadata.version("satcube")
10
9
 
satcube/align.py ADDED
@@ -0,0 +1,98 @@
1
+ from __future__ import annotations
2
+
3
+ import pathlib
4
+ from typing import List, Tuple
5
+ import pickle
6
+ import pandas as pd
7
+ import satalign
8
+ import shutil
9
+
10
+ import numpy as np
11
+ import rasterio as rio
12
+ import xarray as xr
13
+ from affine import Affine
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+ from tqdm import tqdm
16
+
17
+
18
+ def process_row(row: pd.Series, reference: np.ndarray, input_dir: pathlib.Path, output_dir: pathlib.Path) -> None:
19
+ row_path = input_dir / (row["id"] + ".tif")
20
+ output_path = output_dir / (row["id"] + ".tif")
21
+ with rio.open(row_path) as src:
22
+ row_image = src.read()
23
+ profile_image = src.profile
24
+
25
+ row_image_float = row_image.astype(np.float32) / 10000
26
+ row_image_float = row_image_float[np.newaxis, ...]
27
+
28
+ pcc_model = satalign.LGM(
29
+ datacube = row_image_float,
30
+ reference = reference
31
+ )
32
+ image, _ = pcc_model.run_multicore()
33
+ image = (image * 10000).astype(np.uint16).squeeze()
34
+
35
+ with rio.open(output_path, "w", **profile_image) as dst:
36
+ dst.write(image)
37
+
38
+ def align(
39
+ input_dir: str | pathlib.Path = "raw",
40
+ output_dir: str | pathlib.Path = "aligned",
41
+ nworks: int = 4,
42
+ cache: bool = False
43
+ ) -> None:
44
+
45
+ input_dir = pathlib.Path(input_dir).expanduser().resolve()
46
+ output_dir = pathlib.Path(output_dir).expanduser().resolve()
47
+ output_dir.mkdir(parents=True, exist_ok=True)
48
+
49
+ metadata_path = input_dir / "metadata.csv"
50
+
51
+ if not metadata_path.exists():
52
+ raise FileNotFoundError(
53
+ f"Metadata file not found: {metadata_path}. "
54
+ "Please run the download step first."
55
+ )
56
+ else:
57
+ metadata = pd.read_csv(metadata_path)
58
+
59
+ if cache:
60
+ exist_files = [file.stem for file in output_dir.glob("*.tif")]
61
+ metadata = metadata[~metadata["id"].isin(exist_files)]
62
+
63
+ if metadata.empty:
64
+ return
65
+
66
+ id_reference = metadata.sort_values(
67
+ by=["cs_cdf", "date"],
68
+ ascending=False,
69
+ ).iloc[0]["id"]
70
+
71
+ reference_path = input_dir / (id_reference + ".tif")
72
+
73
+ with rio.open(reference_path) as ref_src:
74
+ reference = ref_src.read()
75
+
76
+ reference_float = reference.astype(np.float32) / 10000
77
+
78
+ with ThreadPoolExecutor(max_workers=nworks) as executor:
79
+ futures = {
80
+ executor.submit(process_row, row, reference_float, input_dir, output_dir)
81
+ for _, row in metadata.iterrows()
82
+ }
83
+ for future in tqdm(
84
+ as_completed(futures),
85
+ total=len(futures),
86
+ desc="Aligning images",
87
+ unit="image",
88
+ leave=True
89
+ ):
90
+ try:
91
+ future.result()
92
+ except Exception as e:
93
+ print(f"Error processing image: {e}")
94
+
95
+ metadata = input_dir / "metadata.csv"
96
+ if metadata.exists():
97
+ metadata_dst = output_dir / "metadata.csv"
98
+ shutil.copy(metadata, metadata_dst)
@@ -12,36 +12,148 @@ Example
12
12
 
13
13
  from __future__ import annotations
14
14
 
15
- import time
16
- from pathlib import Path
17
- from typing import List
15
+ import pathlib
18
16
 
19
17
  import mlstac
20
18
  import numpy as np
21
19
  import rasterio as rio
20
+ from rasterio.windows import Window
22
21
  import torch
22
+ import pandas as pd
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from tqdm import tqdm
25
+ import rasterio as rio
26
+ from rasterio.merge import merge
27
+ import shutil
23
28
 
24
- from satcube.utils import DeviceManager, _reset_gpu
25
- import warnings, re
26
-
27
-
29
+ from satcube.utils import define_iteration, DeviceManager
30
+ import warnings
28
31
  warnings.filterwarnings(
29
32
  "ignore",
30
- message=re.escape("The secret `HF_TOKEN` does not exist in your Colab secrets."),
33
+ message="The secret HF_TOKEN does not exist in your Colab secrets.",
31
34
  category=UserWarning,
32
- module="huggingface_hub.utils._auth",
35
+ module=r"huggingface_hub\.utils\._.*",
33
36
  )
34
37
 
35
- def cloud_masking(
36
- input: str | Path, # noqa: A002 (shadowing built-in is OK here)
37
- output: str | Path,
38
+
39
+
40
+ def infer_cloudmask(
41
+ input_path: str | pathlib.Path,
42
+ output_path: str | pathlib.Path,
43
+ cloud_model: torch.nn.Module,
38
44
  *,
39
- tile: int = 512,
40
- pad: int = 64,
45
+ chunk_size: int = 512,
46
+ overlap: int = 32,
47
+ device: str = "cpu",
41
48
  save_mask: bool = False,
49
+ prefix: str = ""
50
+ ) -> pathlib.Path:
51
+ """
52
+ Predict 'image_path' in overlapping patches of 'chunk_size' x 'chunk_size',
53
+ but only write the valid (inner) region to avoid seam artifacts.
54
+
55
+ This uses partial overlap logic:
56
+ - For interior tiles, skip overlap//2 on each side.
57
+ - For boundary tiles, we skip only the interior side to avoid losing data at the edges.
58
+
59
+ Parameters
60
+ ----------
61
+ image_path : Path to input image.
62
+ output_path : Path to output single-band mask.
63
+ cloud_model : PyTorch model (already loaded with weights).
64
+ chunk_size : Size of each tile to read from the source image (default 512).
65
+ overlap : Overlap in pixels between adjacent tiles (default 32).
66
+ device : "cpu" or "cuda:0".
67
+
68
+ Returns
69
+ -------
70
+ pathlib.Path : The path to the created output image.
71
+ """
72
+
73
+ input_path = pathlib.Path(input_path)
74
+ output_path = pathlib.Path(output_path)
75
+
76
+ with rio.open(input_path) as src:
77
+ meta = src.profile
78
+ if not meta.get("tiled", False):
79
+ raise ValueError("The input image is not marked as tiled in its metadata.")
80
+ # Ensure the internal blocksize matches chunk_size
81
+ if chunk_size % meta["blockxsize"] != 0 and meta["blockxsize"] <= chunk_size:
82
+ raise ValueError(f"Image blocks must be {chunk_size}x{chunk_size}, "
83
+ f"got {meta['blockxsize']}x{meta['blockysize']}")
84
+ height, width = meta["height"], meta["width"]
85
+
86
+ full_mask = np.zeros((height, width), dtype=np.float32)
87
+
88
+ coords = define_iteration((height, width), chunk_size, overlap)
89
+
90
+ with rio.open(input_path) as src:
91
+
92
+ for (row_off, col_off) in coords:
93
+
94
+ window = Window(col_off, row_off, chunk_size, chunk_size)
95
+ patch = src.read(window=window) / 1e4
96
+ patch_tensor = torch.from_numpy(patch).float().unsqueeze(0).to(device)
97
+ result = cloud_model(patch_tensor).cpu().numpy().astype(np.uint8)
98
+
99
+ if col_off == 0:
100
+ offset_x = 0
101
+ else:
102
+ offset_x = col_off + overlap // 2
103
+ if row_off == 0:
104
+ offset_y = 0
105
+ else:
106
+ offset_y = row_off + overlap // 2
107
+ if (offset_x + chunk_size) == width:
108
+ length_x = chunk_size
109
+ sub_x_start = 0
110
+ else:
111
+ length_x = chunk_size - (overlap // 2)
112
+ sub_x_start = overlap // 2 if col_off != 0 else 0
113
+
114
+ if (offset_y + chunk_size) == height:
115
+ length_y = chunk_size
116
+ sub_y_start = 0
117
+ else:
118
+ length_y = chunk_size - (overlap // 2)
119
+ sub_y_start = overlap // 2 if row_off != 0 else 0
120
+
121
+ full_mask[
122
+ offset_y : offset_y + length_y,
123
+ offset_x : offset_x + length_x
124
+ ] = result[
125
+ sub_y_start : sub_y_start + length_y,
126
+ sub_x_start : sub_x_start + length_x
127
+ ]
128
+
129
+ if save_mask:
130
+ out_meta = meta.copy()
131
+ out_meta.update(count=1, dtype="uint8", nodata=255)
132
+ output_mask = output_path.parent / (output_path.stem + "_mask.tif")
133
+ with rio.open(output_mask, "w", **out_meta) as dst:
134
+ dst.write(full_mask, 1)
135
+
136
+
137
+ data = src.read()
138
+ img_prof = src.profile.copy()
139
+
140
+ masked = data.copy()
141
+ masked[:, full_mask != 0] = 65535
142
+ img_prof.update(dtype="uint16", nodata=65535)
143
+
144
+ with rio.open(output_path, "w", **img_prof) as dst:
145
+ dst.write(masked)
146
+
147
+ return output_path
148
+
149
+ def cloud_masking(
150
+ input: str | pathlib.Path = "raw",
151
+ output: str | pathlib.Path = "masked",
152
+ model_path: str | pathlib.Path = "SEN2CloudEnsemble",
42
153
  device: str = "cpu",
43
- max_pix_cpu: float = 7.0e7
44
- ) -> List[Path]:
154
+ save_mask: bool = False,
155
+ nworks: int = 4,
156
+ ) -> list[pathlib.Path]:
45
157
  """Write cloud-masked Sentinel-2 images.
46
158
 
47
159
  Parameters
@@ -49,7 +161,8 @@ def cloud_masking(
49
161
  input
50
162
  Path to a single ``.tif`` file **or** a directory containing them.
51
163
  output
52
- Destination directory (created if missing).
164
+ Destination directory (created i
165
+ f missing).
53
166
  tile, pad
54
167
  Tile size and padding (pixels) when tiling is required.
55
168
  save_mask
@@ -64,14 +177,12 @@ def cloud_masking(
64
177
  list[pathlib.Path]
65
178
  Paths to the generated masked images.
66
179
  """
67
- t_start = time.perf_counter()
68
-
69
- src = Path(input).expanduser().resolve()
70
- dst_dir = Path(output).expanduser().resolve()
180
+ src = pathlib.Path(input).expanduser().resolve()
181
+ dst_dir = pathlib.Path(output).expanduser().resolve()
71
182
  dst_dir.mkdir(parents=True, exist_ok=True)
72
183
 
73
184
  # Collect files to process -------------------------------------------------
74
- tif_paths: list[Path]
185
+ tif_paths = []
75
186
  if src.is_dir():
76
187
  tif_paths = [p for p in src.rglob("*.tif")]
77
188
  elif src.is_file() and src.suffix.lower() == ".tif":
@@ -84,95 +195,44 @@ def cloud_masking(
84
195
  print(f"[cloud_masking] No .tif files found in {src}")
85
196
  return []
86
197
 
87
- dir = Path("SEN2CloudEnsemble")
88
-
89
- if not dir.exists():
90
-
198
+ if not pathlib.Path(model_path).exists():
91
199
  mlstac.download(
92
200
  file = "https://huggingface.co/tacofoundation/CloudSEN12-models/resolve/main/SEN2CloudEnsemble/mlm.json",
93
- output_dir = "SEN2CloudEnsemble",
201
+ output_dir = model_path
94
202
  )
95
203
 
96
- experiment = mlstac.load(dir.as_posix())
97
-
98
- dm = DeviceManager(experiment, init_device=device)
99
-
100
- masked_paths: list[Path] = []
101
-
102
- # -------------------------------------------------------------------------
103
- for idx, tif_path in enumerate(tif_paths, 1):
104
- rel = tif_path.relative_to(src)
105
- out_dir = dst_dir / rel.parent
106
- out_dir.mkdir(parents=True, exist_ok=True)
107
-
108
- mask_path = out_dir / f"{tif_path.stem}_cloudmask.tif"
109
- masked_path = out_dir / f"{tif_path.stem}_masked.tif"
110
-
111
- with rio.open(tif_path) as src_img:
112
- profile = src_img.profile
113
- h, w = src_img.height, src_img.width
114
-
115
- mask_prof = profile.copy()
116
- mask_prof.update(driver="GTiff", count=1, dtype="uint8", nodata=255)
117
-
118
- do_tiling = (dm.device == "cuda") or (h * w > max_pix_cpu)
119
- full_mask = np.full((h, w), 255, np.uint8)
120
-
121
- t0 = time.perf_counter()
122
-
123
- # ----------------------- inference -----------------------------------
124
- if not do_tiling: # full frame
125
- with rio.open(tif_path) as src_img, torch.inference_mode():
126
- img = src_img.read().astype(np.float32) / 1e4
127
- h32, w32 = (h + 31) // 32 * 32, (w + 31) // 32 * 32
128
- pad_b, pad_r = h32 - h, w32 - w
129
- tensor = torch.from_numpy(img).unsqueeze(0)
130
- if pad_b or pad_r:
131
- tensor = torch.nn.functional.pad(tensor, (0, pad_r, 0, pad_b))
132
- mask = dm.model(tensor.to(dm.device)).squeeze(0)
133
- full_mask[:] = mask[..., :h, :w].cpu().numpy().astype(np.uint8)
134
- else: # tiled
135
- with rio.open(tif_path) as src_img, torch.inference_mode():
136
- for y0 in range(0, h, tile):
137
- for x0 in range(0, w, tile):
138
- y0r, x0r = max(0, y0 - pad), max(0, x0 - pad)
139
- y1r, x1r = min(h, y0 + tile + pad), min(w, x0 + tile + pad)
140
- win = rio.windows.Window(x0r, y0r, x1r - x0r, y1r - y0r)
141
-
142
- patch = src_img.read(window=win).astype(np.float32) / 1e4
143
- tensor = torch.from_numpy(patch).unsqueeze(0).to(dm.device)
144
- mask = dm.model(tensor).squeeze(0).cpu().numpy().astype(np.uint8)
145
-
146
- y_in0 = pad if y0r else 0
147
- x_in0 = pad if x0r else 0
148
- y_in1 = mask.shape[0] - (pad if y1r < h else 0)
149
- x_in1 = mask.shape[1] - (pad if x1r < w else 0)
150
- core = mask[y_in0:y_in1, x_in0:x_in1]
151
- full_mask[y0 : y0 + core.shape[0], x0 : x0 + core.shape[1]] = core
152
-
153
- # ----------------------- output --------------------------------------
154
- if save_mask:
155
- with rio.open(mask_path, "w", **mask_prof) as dst:
156
- dst.write(full_mask, 1)
157
-
158
- with rio.open(tif_path) as src_img:
159
- data = src_img.read()
160
- img_prof = src_img.profile.copy()
161
-
162
- masked = data.copy()
163
- masked[:, full_mask != 0] = 65535
164
- img_prof.update(dtype="uint16", nodata=65535)
165
-
166
- with rio.open(masked_path, "w", **img_prof) as dst:
167
- dst.write(masked)
168
-
169
- masked_paths.append(masked_path)
170
- dt = time.perf_counter() - t0
171
- print(f"[{idx}/{len(tif_paths)}] {rel} → done in {dt:.1f}s")
172
-
173
- if dm.device == "cuda":
174
- _reset_gpu()
175
-
176
- total_time = time.perf_counter() - t_start
177
- print(f"Processed {len(masked_paths)} image(s) in {total_time:.1f}s.")
178
- return masked_paths
204
+ model = mlstac.load(model_path)
205
+ cloud_model = DeviceManager(model, init_device=device).model
206
+ cloud_model.eval()
207
+
208
+ with ThreadPoolExecutor(max_workers=nworks) as executor:
209
+ futures = {
210
+ executor.submit(
211
+ infer_cloudmask,
212
+ input_path=p,
213
+ output_path=dst_dir / p.name,
214
+ cloud_model=cloud_model,
215
+ device=device,
216
+ save_mask=save_mask,
217
+ prefix=f"[{i+1}/{len(tif_paths)}] "
218
+ ): p for i, p in enumerate(tif_paths)
219
+ }
220
+
221
+ for future in tqdm(
222
+ as_completed(futures),
223
+ total=len(futures),
224
+ desc="Cloud Masking",
225
+ position=0,
226
+ leave=True
227
+ ):
228
+ p = futures[future]
229
+ try:
230
+ result = future.result()
231
+ print(f"{result} processed successfully.")
232
+ except Exception as e:
233
+ print(f"Error processing {p}: {e}")
234
+
235
+ metadata = src / "metadata.csv"
236
+ if metadata.exists():
237
+ metadata_dst = dst_dir / "metadata.csv"
238
+ shutil.copy(metadata, metadata_dst)
satcube/download.py CHANGED
@@ -1,57 +1,68 @@
1
- import pathlib
2
- import ee
3
- import cubexpress
1
+ import sys, time, threading, itertools
2
+ import cubexpress as ce
4
3
  import pandas as pd
4
+ from satcube.objects import SatCubeMetadata
5
+ import pathlib
5
6
 
6
-
7
- def download_data(
8
- *, # keyword-only
7
+ def download(
9
8
  lon: float,
10
9
  lat: float,
11
- cloud_max: int = 40,
12
- edge_size: int = 2_048,
10
+ edge_size: int,
13
11
  start: str,
14
12
  end: str,
15
- output: str = "raw",
16
- scale: int = 10,
17
- nworks: int = 4,
18
- mosaic: bool = True
19
- ) -> pd.DataFrame:
20
- """
21
- Download a Sentinel cube for (lon, lat) and return its metadata.
13
+ *,
14
+ max_cscore: float = 1,
15
+ min_cscore: float = 0,
16
+ outfolder: str = "raw",
17
+ nworks: int = 4
18
+ ) -> "SatCubeMetadata":
19
+
20
+
21
+ outfolder = pathlib.Path(outfolder).resolve()
22
22
 
23
- Parameters
24
- ----------
25
- lon, lat Center point in degrees.
26
- cloud_max Max cloud cover (%).
27
- edge_size Square side length (m).
28
- start, end YYYY-MM-DD date range.
29
- output Folder for GeoTIFFs.
30
- scale Pixel size (m).
31
- nworks Parallel workers.
32
- mosaic Merge scenes per date.
33
- auto_init_gee Call ee.Initialize() if needed.
34
-
35
- Returns
36
- -------
37
- pandas.DataFrame
38
- Scene catalogue used for the request.
39
- """
40
- # Filter scenes
41
- df = cubexpress.cloud_table(
23
+ table = ce.s2_table(
42
24
  lon=lon,
43
25
  lat=lat,
44
26
  edge_size=edge_size,
45
- scale=scale,
46
- cloud_max=cloud_max,
47
27
  start=start,
48
28
  end=end,
29
+ max_cscore=max_cscore,
30
+ min_cscore=min_cscore
31
+ )
32
+
33
+ requests = ce.table_to_requestset(
34
+ table=table,
35
+ mosaic=True
36
+ )
37
+
38
+ ce.get_cube(
39
+ requests=requests,
40
+ outfolder=outfolder,
41
+ nworks=nworks
42
+ )
43
+
44
+ table_req = (
45
+ requests._dataframe.copy()
46
+ .drop(columns=['geotransform', 'manifest', 'outname', 'width', 'height', 'scale_x', 'scale_y'])
47
+ )
48
+
49
+ table_req['date'] = table_req['id'].str.split('_').str[0]
50
+
51
+ result_table = (
52
+ table.groupby('date')
53
+ .agg(
54
+ id=('id', lambda x: '-'.join(x)),
55
+ cs_cdf=('cs_cdf', 'first')
56
+ )
57
+ .reset_index()
49
58
  )
59
+
60
+ table_final = table_req.merge(
61
+ result_table,
62
+ on='date',
63
+ how='left'
64
+ ).rename(columns={'id_x': 'id', 'id_y': 'gee_ids'})
50
65
 
51
- # Build requests + ensure dir
52
- requests = cubexpress.table_to_requestset(df, mosaic=mosaic)
53
- pathlib.Path(output).mkdir(parents=True, exist_ok=True)
66
+ table_final.to_csv(outfolder / "metadata.csv", index=False)
54
67
 
55
- # Download cube
56
- cubexpress.get_cube(requests, output, nworks)
57
- return df
68
+ return SatCubeMetadata(df=table_final, raw_dir=outfolder)
satcube/objects.py ADDED
@@ -0,0 +1,71 @@
1
+ # satcube/objects.py
2
+ from __future__ import annotations
3
+ from dataclasses import dataclass, field
4
+ import pathlib
5
+ import pandas as pd
6
+
7
+ from satcube.align import align as _align_fn
8
+ from satcube.cloud_detection import cloud_masking as _cloud_fn
9
+
10
+ @dataclass
11
+ class SatCubeMetadata:
12
+ df: pd.DataFrame = field(repr=False)
13
+ raw_dir: pathlib.Path = field(repr=False)
14
+
15
+ def __repr__(self) -> str:
16
+ return self.df.__repr__()
17
+
18
+ __str__ = __repr__
19
+
20
+ def _repr_html_(self) -> str:
21
+ html = getattr(self.df, "_repr_html_", None)
22
+ return html() if callable(html) else self.df.__repr__()
23
+
24
+ def align(
25
+ self,
26
+ input_dir: str | pathlib.Path | None = None,
27
+ output_dir: str | pathlib.Path = "aligned",
28
+ nworks: int = 4,
29
+ cache: bool = False
30
+ ) -> "SatCubeMetadata":
31
+
32
+ if input_dir is None:
33
+ input_dir = self.raw_dir
34
+ else:
35
+ input_dir = pathlib.Path(input_dir).expanduser().resolve()
36
+
37
+ _align_fn(
38
+ input_dir=input_dir,
39
+ output_dir=output_dir,
40
+ nworks=nworks,
41
+ cache=cache
42
+ )
43
+ self.aligned_dir = pathlib.Path(output_dir).resolve()
44
+ return self
45
+
46
+ def cloud_masking(
47
+ self,
48
+ output_dir: str | pathlib.Path = "masked",
49
+ model_path: str | pathlib.Path = "SEN2CloudEnsemble",
50
+ device: str = "cpu"
51
+ ) -> "SatCubeMetadata":
52
+ if not hasattr(self, "aligned_dir"):
53
+ raise RuntimeError("You must run .align() first")
54
+ _cloud_fn(
55
+ input=self.aligned_dir,
56
+ output=output_dir,
57
+ model_path=model_path,
58
+ device=device
59
+ )
60
+ self.masked_dir = pathlib.Path(output_dir).resolve()
61
+ return self
62
+
63
+ def __getattr__(self, item):
64
+ return getattr(self.df, item)
65
+
66
+ def __getitem__(self, key):
67
+ return self.df.__getitem__(key)
68
+
69
+ def __len__(self):
70
+ return len(self.df)
71
+