satcube 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of satcube might be problematic. Click here for more details.

@@ -24,8 +24,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
24
24
  from tqdm import tqdm
25
25
  import rasterio as rio
26
26
  from rasterio.merge import merge
27
- import shutil
28
-
29
27
  from satcube.utils import define_iteration, DeviceManager
30
28
  import warnings
31
29
  warnings.filterwarnings(
@@ -35,8 +33,6 @@ warnings.filterwarnings(
35
33
  module=r"huggingface_hub\.utils\._.*",
36
34
  )
37
35
 
38
-
39
-
40
36
  def infer_cloudmask(
41
37
  input_path: str | pathlib.Path,
42
38
  output_path: str | pathlib.Path,
@@ -45,38 +41,16 @@ def infer_cloudmask(
45
41
  chunk_size: int = 512,
46
42
  overlap: int = 32,
47
43
  device: str = "cpu",
48
- save_mask: bool = False,
49
- prefix: str = ""
44
+ save_mask: bool = True
50
45
  ) -> pathlib.Path:
51
- """
52
- Predict 'image_path' in overlapping patches of 'chunk_size' x 'chunk_size',
53
- but only write the valid (inner) region to avoid seam artifacts.
54
-
55
- This uses partial overlap logic:
56
- - For interior tiles, skip overlap//2 on each side.
57
- - For boundary tiles, we skip only the interior side to avoid losing data at the edges.
58
-
59
- Parameters
60
- ----------
61
- image_path : Path to input image.
62
- output_path : Path to output single-band mask.
63
- cloud_model : PyTorch model (already loaded with weights).
64
- chunk_size : Size of each tile to read from the source image (default 512).
65
- overlap : Overlap in pixels between adjacent tiles (default 32).
66
- device : "cpu" or "cuda:0".
67
-
68
- Returns
69
- -------
70
- pathlib.Path : The path to the created output image.
71
- """
72
-
73
- input_path = pathlib.Path(input_path)
74
- output_path = pathlib.Path(output_path)
46
+
47
+ input_path = pathlib.Path(input_path).expanduser().resolve()
48
+ output_path = pathlib.Path(output_path).expanduser().resolve()
75
49
 
76
50
  with rio.open(input_path) as src:
77
51
  meta = src.profile
78
- if not meta.get("tiled", False):
79
- raise ValueError("The input image is not marked as tiled in its metadata.")
52
+ # if not meta.get("tiled", False):
53
+ # raise ValueError("The input image is not marked as tiled in its metadata.")
80
54
  # Ensure the internal blocksize matches chunk_size
81
55
  if chunk_size % meta["blockxsize"] != 0 and meta["blockxsize"] <= chunk_size:
82
56
  raise ValueError(f"Image blocks must be {chunk_size}x{chunk_size}, "
@@ -93,9 +67,21 @@ def infer_cloudmask(
93
67
 
94
68
  window = Window(col_off, row_off, chunk_size, chunk_size)
95
69
  patch = src.read(window=window) / 1e4
96
- patch_tensor = torch.from_numpy(patch).float().unsqueeze(0).to(device)
97
- result = cloud_model(patch_tensor).cpu().numpy().astype(np.uint8)
98
-
70
+
71
+ patch_tensor = (
72
+ torch.from_numpy(patch)
73
+ .float()
74
+ .unsqueeze(0)
75
+ .to(device)
76
+ )
77
+
78
+ result = (
79
+ cloud_model(patch_tensor)
80
+ .cpu()
81
+ .numpy()
82
+ .astype(np.uint8)
83
+ )
84
+
99
85
  if col_off == 0:
100
86
  offset_x = 0
101
87
  else:
@@ -132,8 +118,7 @@ def infer_cloudmask(
132
118
  output_mask = output_path.parent / (output_path.stem + "_mask.tif")
133
119
  with rio.open(output_mask, "w", **out_meta) as dst:
134
120
  dst.write(full_mask, 1)
135
-
136
-
121
+
137
122
  data = src.read()
138
123
  img_prof = src.profile.copy()
139
124
 
@@ -144,57 +129,62 @@ def infer_cloudmask(
144
129
  with rio.open(output_path, "w", **img_prof) as dst:
145
130
  dst.write(masked)
146
131
 
147
- return output_path
132
+ flat = full_mask.astype(np.uint8).ravel()
133
+ counts = np.bincount(flat, minlength=4)
134
+ total = flat.size
135
+ percentages = {
136
+ "id": input_path.stem,
137
+ "clear_pct": counts[0] / total * 100.0,
138
+ "thin_cloud_pct": counts[1] / total * 100.0,
139
+ "cloud_shadow_pct": counts[2] / total * 100.0,
140
+ "thick_cloud_pct": counts[3] / total * 100.0,
141
+ }
142
+
143
+ return percentages
148
144
 
149
- def cloud_masking(
150
- input: str | pathlib.Path = "raw",
151
- output: str | pathlib.Path = "masked",
145
+
146
+ def cloud_fn(
147
+ metadata: pd.DataFrame | None = None,
148
+ input_dir: str | pathlib.Path | None = None,
149
+ output_dir: str | pathlib.Path = "masked",
152
150
  model_path: str | pathlib.Path = "SEN2CloudEnsemble",
153
151
  device: str = "cpu",
154
- save_mask: bool = False,
152
+ save_mask: bool = True,
153
+ cache: bool = False,
155
154
  nworks: int = 4,
156
- ) -> list[pathlib.Path]:
157
- """Write cloud-masked Sentinel-2 images.
158
-
159
- Parameters
160
- ----------
161
- input
162
- Path to a single ``.tif`` file **or** a directory containing them.
163
- output
164
- Destination directory (created i
165
- f missing).
166
- tile, pad
167
- Tile size and padding (pixels) when tiling is required.
168
- save_mask
169
- If *True*, store the binary mask alongside the masked image.
170
- device
171
- Torch device for inference, e.g. ``"cpu"`` or ``"cuda:0"``.
172
- max_pix_cpu
173
- Tile images larger than this when running on CPU.
174
-
175
- Returns
176
- ------
177
- list[pathlib.Path]
178
- Paths to the generated masked images.
179
- """
180
- src = pathlib.Path(input).expanduser().resolve()
181
- dst_dir = pathlib.Path(output).expanduser().resolve()
182
- dst_dir.mkdir(parents=True, exist_ok=True)
183
-
184
- # Collect files to process -------------------------------------------------
185
- tif_paths = []
186
- if src.is_dir():
187
- tif_paths = [p for p in src.rglob("*.tif")]
188
- elif src.is_file() and src.suffix.lower() == ".tif":
189
- tif_paths = [src]
190
- src = src.parent # for relative-path bookkeeping below
191
- else:
192
- raise ValueError(f"Input must be a .tif or directory, got: {src}")
193
-
194
- if not tif_paths:
195
- print(f"[cloud_masking] No .tif files found in {src}")
196
- return []
155
+ ) -> pd.DataFrame | None:
197
156
 
157
+ input_dir = pathlib.Path(input_dir).expanduser().resolve()
158
+ output_dir = pathlib.Path(output_dir).expanduser().resolve()
159
+ output_dir.mkdir(parents=True, exist_ok=True)
160
+
161
+ if metadata is None:
162
+ if not input_dir:
163
+ raise ValueError("Input directory must be specified.")
164
+ else:
165
+ if input_dir.is_dir():
166
+ tif_paths = [p for p in input_dir.rglob("*.tif")]
167
+ df = pd.DataFrame({
168
+ "id": [p.stem for p in tif_paths],
169
+ "path": [str(p) for p in tif_paths]
170
+ })
171
+ elif input_dir.is_file() and input_dir.suffix.lower() == ".tif":
172
+ tif_paths = [input_dir]
173
+ input_dir = input_dir.parent
174
+ else:
175
+ raise ValueError(f"Input must be a .tif or directory, got: {input_dir}")
176
+ else:
177
+ if not input_dir:
178
+ raise ValueError("Input directory must be specified.")
179
+ else:
180
+ df = metadata["id"].to_frame()
181
+ df["path"] = df["id"].apply(lambda x: str(input_dir / (x + ".tif")))
182
+
183
+
184
+ if cache:
185
+ exist_files = [file.stem for file in output_dir.glob("*.tif")]
186
+ df = df[~df["id"].isin(exist_files)]
187
+
198
188
  if not pathlib.Path(model_path).exists():
199
189
  mlstac.download(
200
190
  file = "https://huggingface.co/tacofoundation/CloudSEN12-models/resolve/main/SEN2CloudEnsemble/mlm.json",
@@ -202,22 +192,23 @@ def cloud_masking(
202
192
  )
203
193
 
204
194
  model = mlstac.load(model_path)
205
- cloud_model = DeviceManager(model, init_device=device).model
206
- cloud_model.eval()
195
+ cloud_model = DeviceManager(model, init_device=device).model.eval()
196
+
197
+ results_cloud = []
207
198
 
208
199
  with ThreadPoolExecutor(max_workers=nworks) as executor:
209
200
  futures = {
210
201
  executor.submit(
211
202
  infer_cloudmask,
212
- input_path=p,
213
- output_path=dst_dir / p.name,
203
+ input_path=p["path"],
204
+ output_path=output_dir / (p["id"] + ".tif"),
214
205
  cloud_model=cloud_model,
215
206
  device=device,
216
- save_mask=save_mask,
217
- prefix=f"[{i+1}/{len(tif_paths)}] "
218
- ): p for i, p in enumerate(tif_paths)
207
+ save_mask=save_mask
208
+ ): p for i, p in df.iterrows()
219
209
  }
220
210
 
211
+
221
212
  for future in tqdm(
222
213
  as_completed(futures),
223
214
  total=len(futures),
@@ -228,11 +219,25 @@ def cloud_masking(
228
219
  p = futures[future]
229
220
  try:
230
221
  result = future.result()
231
- print(f"{result} processed successfully.")
222
+ results_cloud.append(result)
232
223
  except Exception as e:
233
224
  print(f"Error processing {p}: {e}")
234
225
 
235
- metadata = src / "metadata.csv"
236
- if metadata.exists():
237
- metadata_dst = dst_dir / "metadata.csv"
238
- shutil.copy(metadata, metadata_dst)
226
+ cloud_df = pd.DataFrame(results_cloud)
227
+
228
+ if cloud_df.empty:
229
+ return metadata
230
+
231
+ metadata = metadata.drop(
232
+ columns=["clear_pct","thin_cloud_pct", "cloud_shadow_pct", "thick_cloud_pct"],
233
+ errors="ignore"
234
+ )
235
+
236
+ metadata = metadata.merge(
237
+ cloud_df,
238
+ on="id",
239
+ how="left",
240
+ suffixes=('', '')
241
+ )
242
+
243
+ return metadata
satcube/composite.py ADDED
@@ -0,0 +1,85 @@
1
+ import pathlib
2
+ from typing import Tuple
3
+ import numpy as np
4
+ import pandas as pd
5
+ import rasterio as rio
6
+ from concurrent.futures import ProcessPoolExecutor, as_completed
7
+
8
+
9
+ def monthly_composites_s2(
10
+ metadata: pd.DataFrame | None = None,
11
+ input_dir: str | pathlib.Path | None = None,
12
+ output_dir: str | pathlib.Path = "monthly_composites",
13
+ date_range: Tuple[str, str] = ("2018-06-01", "2020-01-01"),
14
+ agg_method: str = "median",
15
+ ):
16
+
17
+ input_dir = pathlib.Path(input_dir).expanduser().resolve()
18
+ output_dir = pathlib.Path(output_dir).expanduser().resolve()
19
+ output_dir.mkdir(parents=True, exist_ok=True)
20
+
21
+ all_raw_files = [input_dir / f for f in input_dir.glob("*.tif") if f.is_file()]
22
+
23
+ with rio.open(all_raw_files[0]) as src:
24
+ profile = src.profile
25
+
26
+
27
+ all_raw_dates = pd.to_datetime(metadata["date"])
28
+ all_raw_date_min = pd.to_datetime(date_range[0])
29
+ all_raw_date_max = pd.to_datetime(date_range[1])
30
+ all_raw_dates_unique = pd.date_range(
31
+ all_raw_date_min, all_raw_date_max, freq="MS"
32
+ ) + pd.DateOffset(days=14)
33
+ all_raw_dates_unique = all_raw_dates_unique.strftime("%Y-%m-15")
34
+
35
+ # Aggregate the data considering the method and dates
36
+ new_table = []
37
+ for idx, date in enumerate(all_raw_dates_unique):
38
+
39
+ # Get the images to aggregate
40
+ idxs = all_raw_dates.dt.strftime("%Y-%m-15") == date
41
+ images = [all_raw_files[i] for i in np.where(idxs)[0]]
42
+
43
+ if len(images) == 0:
44
+ data = np.ones((profile["count"], profile["height"], profile["width"]))
45
+ data = 65535 * data
46
+ nodata = 1
47
+ profile_image = profile
48
+ else:
49
+ # Read the images
50
+ container = []
51
+ for image in images:
52
+ with rio.open(image) as src:
53
+ data = src.read()
54
+ profile_image = src.profile
55
+ container.append(data)
56
+
57
+ # Aggregate the data
58
+ if agg_method == "mean":
59
+ data = np.mean(container, axis=0)
60
+ elif agg_method == "median":
61
+ data = np.median(container, axis=0)
62
+ elif agg_method == "max":
63
+ data = np.max(container, axis=0)
64
+ elif agg_method == "min":
65
+ data = np.min(container, axis=0)
66
+ else:
67
+ raise ValueError("Invalid aggregation method")
68
+
69
+ nodata = 0
70
+
71
+ # Save the image
72
+ with rio.open(output_dir / f"{date}.tif", "w", **profile_image) as dst:
73
+ dst.write(data.astype(rio.uint16))
74
+
75
+
76
+ meta_dict = {
77
+ "outname": f"{date}.tif",
78
+ "date": date,
79
+ "nodata": nodata,
80
+ }
81
+
82
+ new_table.append(meta_dict)
83
+
84
+
85
+ return pd.DataFrame(new_table)
satcube/download.py CHANGED
@@ -17,7 +17,6 @@ def download(
17
17
  nworks: int = 4
18
18
  ) -> "SatCubeMetadata":
19
19
 
20
-
21
20
  outfolder = pathlib.Path(outfolder).resolve()
22
21
 
23
22
  table = ce.s2_table(
@@ -57,12 +56,10 @@ def download(
57
56
  .reset_index()
58
57
  )
59
58
 
60
- table_final = table_req.merge(
59
+ df = table_req.merge(
61
60
  result_table,
62
61
  on='date',
63
62
  how='left'
64
63
  ).rename(columns={'id_x': 'id', 'id_y': 'gee_ids'})
65
64
 
66
- table_final.to_csv(outfolder / "metadata.csv", index=False)
67
-
68
- return SatCubeMetadata(df=table_final, raw_dir=outfolder)
65
+ return SatCubeMetadata(df=df, raw_dir=outfolder)
satcube/gapfill.py ADDED
@@ -0,0 +1,216 @@
1
+ # satcube/gapfill.py
2
+ from __future__ import annotations
3
+
4
+ import pathlib, shutil
5
+ from typing import Literal, List, Tuple
6
+ import numpy as np
7
+ import pandas as pd
8
+ import rasterio as rio
9
+ from tqdm import tqdm
10
+
11
+ from sklearn.linear_model import LinearRegression
12
+
13
+ _GAP_METHOD = Literal["histogram_matching", "linear"]
14
+
15
+
16
+
17
+
18
def linear_interpolation(
    image1: np.ndarray, image2: np.ndarray, image3: np.ndarray
) -> np.ndarray:
    """Match ``image3`` radiometrically with a least-squares line.

    A line ``image1 = slope * image2 + intercept`` is fitted on the pixels
    that are valid (non-NaN) in *both* reference images, and that line is
    then applied to ``image3``.

    Args:
        image1 (np.ndarray): The first reference image (regression target).
        image2 (np.ndarray): The second reference image (regression
            predictor); must hold as many pixels as ``image1``.
        image3 (np.ndarray): The image to be matched; NaNs propagate.

    Returns:
        np.ndarray: ``slope * image3 + intercept``, same shape as ``image3``.

    Raises:
        ValueError: If the references differ in size or share no valid pixel.
    """
    target = np.asarray(image1, dtype=float).ravel()
    predictor = np.asarray(image2, dtype=float).ravel()
    if target.size != predictor.size:
        raise ValueError(
            "Reference images must have the same number of pixels "
            f"({target.size} != {predictor.size})."
        )

    # Keep only pixel pairs valid in both references: filtering each image
    # on its own would misalign the (x, y) samples of the regression.
    valid = ~(np.isnan(target) | np.isnan(predictor))
    if not valid.any():
        raise ValueError("Reference images share no valid (non-NaN) pixel.")

    x = predictor[valid]
    y = target[valid]
    x_mean = float(x.mean())
    y_mean = float(y.mean())

    # Ordinary least squares on centred data.
    dx = x - x_mean
    spread = float(np.dot(dx, dx))
    if spread > 0.0:
        slope = float(np.dot(dx, y - y_mean)) / spread
    else:
        # Constant predictor: the best fit is the horizontal line at mean(y).
        slope = 0.0
    intercept = y_mean - slope * x_mean

    return slope * np.asarray(image3) + intercept
50
+
51
+
52
def tripple_histogram_matching(
    image1: np.ndarray,
    image2: np.ndarray,
    image3: np.ndarray,
    n_bins: int = 128,
    value_range: Tuple[float, float] = (0.0, 2.0),
) -> np.ndarray:
    """Apply histogram matching to image3 using image1 and image2 as references.

    A lookup table is built that maps the value distribution of ``image2``
    onto that of ``image1`` (NaNs in either are ignored), and the table is
    then applied to ``image3``.

    Args:
        image1 (np.ndarray): The first reference image (target distribution).
        image2 (np.ndarray): The second reference image (source distribution).
        image3 (np.ndarray): The image to be matched.
        n_bins (int): Number of histogram bins.
        value_range (Tuple[float, float]): Value interval covered by the
            histograms; samples outside it do not contribute.

    Returns:
        np.ndarray: The matched image, same shape as ``image3``.  When either
        reference has no valid sample inside ``value_range`` no mapping can
        be derived and a float copy of ``image3`` is returned unchanged.
    """
    target = np.asarray(image1, dtype=float).ravel()
    target = target[~np.isnan(target)]
    source = np.asarray(image2, dtype=float).ravel()
    source = source[~np.isnan(source)]
    image3 = np.asarray(image3, dtype=float)

    hist_target, edges = np.histogram(target, n_bins, value_range)
    hist_source, _ = np.histogram(source, n_bins, value_range)

    n_target = hist_target.sum()
    n_source = hist_source.sum()
    if n_target == 0 or n_source == 0:
        # An empty histogram would turn the CDF into 0/0 = NaN.
        return image3.copy()

    cdf_target = hist_target.cumsum() / n_target
    cdf_source = hist_source.cumsum() / n_source

    # For every source level, find the target level with the same CDF value.
    levels = edges[:-1]
    lut = np.interp(cdf_source, cdf_target, levels)

    return np.interp(image3.ravel(), levels, lut).reshape(image3.shape)
94
+
95
+
96
def _fill_one(
    img_path: pathlib.Path,
    ref_paths: List[pathlib.Path],
    dates: np.ndarray,
    this_date: np.datetime64,
    *,
    method: _GAP_METHOD,
    out_dir: pathlib.Path,
    quiet: bool,
    max_tries: int = 5,
) -> float:
    """Gap-fill a single S2 scene and return its error metric.

    Pixels equal to 65535 are treated as gaps.  Reference scenes are visited
    from the temporally closest to the farthest; a reference is rejected when
    it has a gap where the scene has one.  Each accepted reference is matched
    band by band to the scene (``method``) and scored; the best-scoring one
    fills the gaps.  The result is written to ``out_dir`` under the scene's
    own file name.

    Parameters
    ----------
    img_path
        Scene to fill.
    ref_paths
        Candidate reference scenes (may include ``img_path``, which is
        skipped).
    dates
        Acquisition date of every entry of ``ref_paths``.
    this_date
        Acquisition date of ``img_path``.
    method
        ``"histogram_matching"``; any other value selects the linear fit.
    out_dir
        Existing output folder.
    quiet
        Suppress progress messages.
    max_tries
        Maximum number of accepted references to evaluate.

    Returns
    -------
    float
        ``0.0`` when the scene has no gap (file copied as is), ``nan`` when
        no usable reference was found (file copied as is), otherwise the
        score of the chosen reference.
    """
    with rio.open(img_path) as src:
        data = src.read() / 1e4
        prof = src.profile
    data[data == 6.5535] = np.nan  # 65535 / 1e4: the no-data value
    gaps = np.isnan(data)

    if not gaps.any():  # clean image: copy without processing
        shutil.copy(img_path, out_dir / img_path.name)
        return 0.0

    gap_pixels = gaps.any(axis=0)  # pixel has a gap in at least one band
    if method == "histogram_matching":
        matcher = tripple_histogram_matching
    else:  # "linear"
        matcher = linear_interpolation

    # Visit the other scenes by temporal proximity.  With no valid pixel at
    # all there is nothing to calibrate against, so no reference is tried.
    order = [] if gap_pixels.all() else np.argsort(np.abs(dates - this_date))
    best_img, best_metric = None, np.inf
    tries = 0

    for i in order:
        if tries >= max_tries:
            break
        ref_path = ref_paths[i]
        if ref_path == img_path:
            continue

        with rio.open(ref_path) as src:
            ref = src.read() / 1e4
        ref[ref == 6.5535] = np.nan
        ref_gaps = np.isnan(ref)

        # Discard references that are missing where the scene is missing.
        if np.any(gaps & ref_gaps):
            continue

        # Calibrate only on pixels valid in both the scene and the reference.
        invalid = gap_pixels | ref_gaps
        if invalid.all():
            continue
        data_masked = np.where(invalid, np.nan, data)
        ref_masked = np.where(invalid, np.nan, ref)

        filled = np.zeros_like(data)
        for b in range(data.shape[0]):
            filled[b] = matcher(data_masked[b], ref_masked[b], ref[b])

        # Score: mean relative difference of the first three bands.
        a = filled[[2, 1, 0]].mean(0)
        b = data[[2, 1, 0]].mean(0)
        with np.errstate(divide="ignore", invalid="ignore"):
            metric = np.nanmean(np.abs(a - b) / (a + b))

        if metric < best_metric:
            best_metric = metric
            best_img = filled

        tries += 1

    if best_img is None:  # no suitable ref found
        if not quiet:
            print(f"{img_path.name}: no cloud‑free neighbour found – copied.")
        shutil.copy(img_path, out_dir / img_path.name)
        return np.nan  # could also return 0.0

    # Fill using the scene's own gap mask.  The mask built inside the loop
    # belongs to the last reference tried, not necessarily the best one.
    final = np.where(gaps, best_img, data)
    final[np.isnan(final)] = 0
    # Round instead of truncating, and clip so the uint16 cast cannot wrap.
    final = np.clip(np.rint(final * 1e4), 0, 65535).astype(np.uint16)

    with rio.open(out_dir / img_path.name, "w", **prof) as dst:
        dst.write(final)

    if not quiet:
        print(f"{img_path.name} gap‑filled (error={best_metric:.4f})")

    return float(best_metric)
184
+
185
+
186
def gapfill_fn(  # wrapper in the style of align_fn
    metadata: pd.DataFrame,
    input_dir: str | pathlib.Path,
    output_dir: str | pathlib.Path = "gapfilled",
    *,
    method: _GAP_METHOD = "histogram_matching",
    quiet: bool = False
) -> pd.DataFrame:
    """Gap-fill every scene listed in *metadata*.

    Each row's scene is ``input_dir / f"{id}.tif"``; all listed scenes also
    serve as candidate references for one another.  Results are written to
    ``output_dir`` (created if missing).

    Returns
    -------
    pd.DataFrame
        Copy of the input table with a (re)computed ``match_error`` column.
    """
    src_dir = pathlib.Path(input_dir).expanduser().resolve()
    dst_dir = pathlib.Path(output_dir).expanduser().resolve()
    dst_dir.mkdir(parents=True, exist_ok=True)

    scene_paths = [src_dir / f"{scene_id}.tif" for scene_id in metadata["id"]]
    scene_dates = pd.to_datetime(metadata["date"]).to_numpy()

    progress = tqdm(scene_paths, desc="Gap‑filling", unit="img")
    match_errors: List[float] = [
        _fill_one(
            scene,
            scene_paths,
            scene_dates,
            scene_date,
            method=method,
            out_dir=dst_dir,
            quiet=quiet,
        )
        for scene, scene_date in zip(progress, scene_dates)
    ]

    # Drop any stale column first so the new values replace it.
    result = metadata.drop(columns=["match_error"], errors="ignore")
    result["match_error"] = match_errors
    return result