ssb-sgis 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,6 @@ import datetime
2
2
  import functools
3
3
  import glob
4
4
  import itertools
5
- import math
6
5
  import os
7
6
  import random
8
7
  import re
@@ -11,6 +10,7 @@ from collections.abc import Callable
11
10
  from collections.abc import Iterable
12
11
  from collections.abc import Iterator
13
12
  from collections.abc import Sequence
13
+ from concurrent.futures import ThreadPoolExecutor
14
14
  from copy import deepcopy
15
15
  from dataclasses import dataclass
16
16
  from pathlib import Path
@@ -27,6 +27,7 @@ from affine import Affine
27
27
  from geopandas import GeoDataFrame
28
28
  from geopandas import GeoSeries
29
29
  from matplotlib.colors import LinearSegmentedColormap
30
+ from pandas.api.types import is_dict_like
30
31
  from rasterio.enums import MergeAlg
31
32
  from scipy import stats
32
33
  from scipy.ndimage import binary_dilation
@@ -88,8 +89,10 @@ except ImportError:
88
89
  from ..geopandas_tools.bounds import get_total_bounds
89
90
  from ..geopandas_tools.conversion import to_bbox
90
91
  from ..geopandas_tools.conversion import to_gdf
92
+ from ..geopandas_tools.conversion import to_geoseries
91
93
  from ..geopandas_tools.conversion import to_shapely
92
94
  from ..geopandas_tools.general import get_common_crs
95
+ from ..helpers import _fix_path
93
96
  from ..helpers import get_all_files
94
97
  from ..helpers import get_numpy_func
95
98
  from ..io._is_dapla import is_dapla
@@ -101,7 +104,6 @@ from .base import _get_shape_from_bounds
101
104
  from .base import _get_transform_from_bounds
102
105
  from .base import get_index_mapper
103
106
  from .indices import ndvi
104
- from .regex import _any_regex_matches
105
107
  from .regex import _extract_regex_match_from_string
106
108
  from .regex import _get_first_group_match
107
109
  from .regex import _get_non_optional_groups
@@ -157,14 +159,21 @@ ALLOWED_INIT_KWARGS = [
157
159
  "band_class",
158
160
  "image_regexes",
159
161
  "filename_regexes",
160
- "bounds_regexes",
161
162
  "all_bands",
162
163
  "crs",
164
+ "backend",
163
165
  "masking",
164
166
  "_merged",
165
- "_add_metadata_attributes",
166
167
  ]
167
168
 
169
+ _load_counter: int = 0
170
+
171
+
172
def _get_child_paths_threaded(data: Sequence[str]) -> set[str]:
    """Collect the results of ``_ls_func`` for every entry in 'data' into one set.

    ``_ls_func`` is mapped over 'data' in a thread pool and the iterables it
    returns are merged, so duplicates across entries are dropped.
    """
    child_paths: set[str] = set()
    with ThreadPoolExecutor() as pool:
        # consuming the map iterator re-raises any exception from a worker
        for listed in pool.map(_ls_func, data):
            child_paths.update(listed)
    return child_paths
176
+
168
177
 
169
178
  class ImageCollectionGroupBy:
170
179
  """Iterator and merger class returned from groupby.
@@ -216,7 +225,6 @@ class ImageCollectionGroupBy:
216
225
 
217
226
  collection = ImageCollection(
218
227
  images,
219
- # TODO band_class?
220
228
  level=self.collection.level,
221
229
  **self.collection._common_init_kwargs,
222
230
  )
@@ -254,7 +262,6 @@ class ImageCollectionGroupBy:
254
262
 
255
263
  image = Image(
256
264
  bands,
257
- # TODO band_class?
258
265
  **self.collection._common_init_kwargs,
259
266
  )
260
267
  image._merged = True
@@ -284,49 +291,20 @@ class ImageCollectionGroupBy:
284
291
  return f"{self.__class__.__name__}({len(self)})"
285
292
 
286
293
 
287
- def standardize_band_id(x: str) -> str:
288
- return x.replace("B", "").replace("A", "").zfill(2)
289
-
290
-
291
- class BandIdDict(dict):
292
- """Dict that tells the band initialiser to get the dict value of the band_id."""
293
-
294
- def __init__(self, data: dict | None = None, **kwargs) -> None:
295
- """Add dicts or kwargs."""
296
- self._standardized_keys = {}
297
- for key, value in ((data or {}) | kwargs).items():
298
- setattr(self, key, value)
299
- self._standardized_keys[standardize_band_id(key)] = value
300
-
301
- def __len__(self) -> int:
302
- """Number of items."""
303
- return len({key for key in self.__dict__ if key != "_standardized_keys"})
304
-
305
- def __getitem__(self, item: str) -> Any:
306
- """Get dict value from key."""
307
- try:
308
- return getattr(self, item)
309
- except AttributeError as e:
310
- try:
311
- return self._standardized_keys[standardize_band_id(item)]
312
- except KeyError:
313
- raise KeyError(item, self.__dict__) from e
314
-
315
-
316
294
  @dataclass(frozen=True)
317
295
  class BandMasking:
318
296
  """Frozen dict with forced keys."""
319
297
 
320
298
  band_id: str
321
- values: tuple[int]
299
+ values: Sequence[int] | dict[int, Any]
322
300
 
323
301
  def __getitem__(self, item: str) -> Any:
324
302
  """Index into attributes to mimick dict."""
325
303
  return getattr(self, item)
326
304
 
327
305
 
328
- class NoLevel:
329
- """Equivelant to None."""
306
+ class None_:
307
+ """Default value for keyword arguments that should not have a default."""
330
308
 
331
309
 
332
310
  class _ImageBase:
@@ -335,7 +313,7 @@ class _ImageBase:
335
313
  metadata_attributes: ClassVar[dict | None] = None
336
314
  masking: ClassVar[BandMasking | None] = None
337
315
 
338
- def __init__(self, *, bbox=None, **kwargs) -> None:
316
+ def __init__(self, *, metadata=None, bbox=None, **kwargs) -> None:
339
317
 
340
318
  self._mask = None
341
319
  self._bounds = None
@@ -344,9 +322,12 @@ class _ImageBase:
344
322
  self._from_gdf = False
345
323
  self.metadata_attributes = self.metadata_attributes or {}
346
324
  self._path = None
325
+ self._metadata_from_xml = False
347
326
 
348
327
  self._bbox = to_bbox(bbox) if bbox is not None else None
349
328
 
329
+ self.metadata = self._metadata_to_nested_dict(metadata)
330
+
350
331
  if self.filename_regexes:
351
332
  if isinstance(self.filename_regexes, str):
352
333
  self.filename_regexes = (self.filename_regexes,)
@@ -374,14 +355,45 @@ class _ImageBase:
374
355
  f"{self.__class__.__name__} got an unexpected keyword argument '{key}'"
375
356
  )
376
357
 
358
+ @staticmethod
359
+ def _metadata_to_nested_dict(
360
+ metadata: str | Path | os.PathLike | dict | pd.DataFrame | None,
361
+ ) -> dict[str, dict[str, Any]] | None:
362
+ if metadata is None:
363
+ return {}
364
+ if isinstance(metadata, (str | Path | os.PathLike)):
365
+ metadata = _read_parquet_func(metadata)
366
+
367
+ if isinstance(metadata, pd.DataFrame):
368
+
369
+ def is_scalar(x) -> bool:
370
+ return not hasattr(x, "__len__") or len(x) <= 1
371
+
372
+ def na_to_none(x) -> None:
373
+ """Convert to None rowwise because pandas doesn't always."""
374
+ return x if not (is_scalar(x) and pd.isna(x)) else None
375
+
376
+ # to nested dict because pandas indexing gives rare KeyError with long strings
377
+ metadata = {
378
+ _fix_path(path): {
379
+ attr: na_to_none(value) for attr, value in row.items()
380
+ }
381
+ for path, row in metadata.iterrows()
382
+ }
383
+ elif is_dict_like(metadata):
384
+ metadata = {_fix_path(path): value for path, value in metadata.items()}
385
+
386
+ return metadata
387
+
377
388
  @property
378
389
  def _common_init_kwargs(self) -> dict:
379
390
  return {
380
- "file_system": self.file_system,
381
391
  "processes": self.processes,
382
392
  "res": self.res,
383
393
  "bbox": self._bbox,
384
394
  "nodata": self.nodata,
395
+ "backend": self.backend,
396
+ "metadata": self.metadata,
385
397
  }
386
398
 
387
399
  @property
@@ -401,6 +413,14 @@ class _ImageBase:
401
413
  """Centerpoint of the object."""
402
414
  return self.union_all().centroid
403
415
 
416
def assign(self, **kwargs) -> "_ImageBase":
    """Set attributes from keyword arguments and return the object itself.

    If setting a name raises AttributeError (as a property without a
    setter does), the value is stored under the same name prefixed with
    an underscore instead.
    """
    for name in kwargs:
        new_value = kwargs[name]
        try:
            setattr(self, name, new_value)
        except AttributeError:
            private_name = f"_{name}"
            setattr(self, private_name, new_value)
    return self
423
+
404
424
  def _name_regex_searcher(
405
425
  self, group: str, patterns: tuple[re.Pattern]
406
426
  ) -> str | None:
@@ -411,18 +431,28 @@ class _ImageBase:
411
431
  return _get_first_group_match(pat, self.name)[group]
412
432
  except (TypeError, KeyError):
413
433
  pass
434
+ if isinstance(self, Band):
435
+ for pat in patterns:
436
+ try:
437
+ return _get_first_group_match(
438
+ pat, str(Path(self.path).parent.name)
439
+ )[group]
440
+ except (TypeError, KeyError):
441
+ pass
414
442
  if not any(group in _get_non_optional_groups(pat) for pat in patterns):
415
443
  return None
444
+ band_text = (
445
+ f" or {Path(self.path).parent.name!s}" if isinstance(self, Band) else ""
446
+ )
416
447
  raise ValueError(
417
- f"Couldn't find group '{group}' in name {self.name} with regex patterns {patterns}"
448
+ f"Couldn't find group '{group}' in name {self.name}{band_text} with regex patterns {patterns}"
418
449
  )
419
450
 
420
- def _create_metadata_df(self, file_paths: list[str]) -> pd.DataFrame:
451
+ def _create_metadata_df(self, file_paths: Sequence[str]) -> pd.DataFrame:
421
452
  """Create a dataframe with file paths and image paths that match regexes."""
422
- df = pd.DataFrame({"file_path": file_paths})
453
+ df = pd.DataFrame({"file_path": list(file_paths)})
423
454
 
424
- df["file_path"] = df["file_path"].apply(_fix_path)
425
- df["filename"] = df["file_path"].apply(lambda x: Path(x).name)
455
+ df["file_name"] = df["file_path"].apply(lambda x: Path(x).name)
426
456
 
427
457
  df["image_path"] = df["file_path"].apply(
428
458
  lambda x: _fix_path(str(Path(x).parent))
@@ -434,20 +464,20 @@ class _ImageBase:
434
464
  df = df[~df["file_path"].isin(df["image_path"])]
435
465
 
436
466
  if self.filename_patterns:
437
- df = _get_regexes_matches_for_df(df, "filename", self.filename_patterns)
467
+ df = _get_regexes_matches_for_df(df, "file_name", self.filename_patterns)
438
468
 
439
469
  if not len(df):
440
470
  return df
441
471
 
442
472
  grouped = df.drop_duplicates("image_path").set_index("image_path")
443
- for col in ["file_path", "filename"]:
473
+ for col in ["file_path", "file_name"]:
444
474
  if col in df:
445
475
  grouped[col] = df.groupby("image_path")[col].apply(tuple)
446
476
 
447
477
  grouped = grouped.reset_index()
448
478
  else:
449
479
  df["file_path"] = df.groupby("image_path")["file_path"].apply(tuple)
450
- df["filename"] = df.groupby("image_path")["filename"].apply(tuple)
480
+ df["file_name"] = df.groupby("image_path")["file_name"].apply(tuple)
451
481
  grouped = df.drop_duplicates("image_path")
452
482
 
453
483
  grouped["imagename"] = grouped["image_path"].apply(
@@ -521,7 +551,7 @@ class _ImageBandBase(_ImageBase):
521
551
  return self._name
522
552
  try:
523
553
  return Path(self.path).name
524
- except (ValueError, AttributeError):
554
+ except (ValueError, AttributeError, TypeError):
525
555
  return None
526
556
 
527
557
  @name.setter
@@ -532,22 +562,31 @@ class _ImageBandBase(_ImageBase):
532
562
  def stem(self) -> str | None:
533
563
  try:
534
564
  return Path(self.path).stem
535
- except (AttributeError, ValueError):
565
+ except (AttributeError, ValueError, TypeError):
536
566
  return None
537
567
 
538
568
  @property
539
569
  def level(self) -> str:
540
570
  return self._name_regex_searcher("level", self.image_patterns)
541
571
 
542
- def _add_metadata_attributes(self):
572
+ def _get_metadata_attributes(self, metadata_attributes: dict) -> dict:
543
573
 
544
- missing_attributes = {}
545
- for key, value in self.metadata_attributes.items():
546
- if getattr(self, key) is None:
547
- missing_attributes[key] = value
574
+ self._metadata_from_xml = True
548
575
 
549
- if not missing_attributes:
550
- return
576
+ missing_metadata_attributes = {
577
+ key: value
578
+ for key, value in metadata_attributes.items()
579
+ if not hasattr(self, key) or getattr(self, key) is None
580
+ }
581
+
582
+ nonmissing_metadata_attributes = {
583
+ key: getattr(self, key)
584
+ for key in metadata_attributes
585
+ if key not in missing_metadata_attributes
586
+ }
587
+
588
+ if not missing_metadata_attributes:
589
+ return nonmissing_metadata_attributes
551
590
 
552
591
  file_contents: list[str] = []
553
592
  for path in self._all_file_paths:
@@ -556,7 +595,7 @@ class _ImageBandBase(_ImageBase):
556
595
  with _open_func(path, "rb") as file:
557
596
  file_contents.append(file.read().decode("utf-8"))
558
597
 
559
- for key, value in missing_attributes.items():
598
+ for key, value in missing_metadata_attributes.items():
560
599
  results = None
561
600
  for i, filetext in enumerate(file_contents):
562
601
  if isinstance(value, str) and value in dir(self):
@@ -586,16 +625,46 @@ class _ImageBandBase(_ImageBase):
586
625
  if i == len(self._all_file_paths) - 1:
587
626
  raise e
588
627
 
589
- if isinstance(results, BandIdDict) and isinstance(self, Band):
590
- results = results[self.band_id]
628
+ missing_metadata_attributes[key] = results
591
629
 
592
- setattr(self, key, results)
630
+ return missing_metadata_attributes | nonmissing_metadata_attributes
631
+
632
def _to_xarray(self, array: np.ndarray, transform: Affine) -> DataArray:
    """Wrap 'array' in an xarray.DataArray with coordinates and attributes.

    A 2-dimensional array gets the dims ("y", "x") and a 3-dimensional one
    ("band", "y", "x"). The attrs hold 'crs' plus 'date' and every name in
    'metadata_attributes' that can be read from the object.

    Raises:
        ValueError: If the array is not 2 or 3 dimensional.
    """
    n_dims = len(array.shape)
    if n_dims not in (2, 3):
        raise ValueError(
            f"Array should be 2 or 3 dimensional. Got shape {array.shape}"
        )

    # the two trailing axes hold rows and columns in both layouts
    n_rows, n_cols = array.shape[-2:]
    dim_names = ["y", "x"] if n_dims == 2 else ["band", "y", "x"]

    spatial_coords = _generate_spatial_coords(transform, n_cols, n_rows)

    attributes = {"crs": self.crs}
    for attribute_name in set(self.metadata_attributes) | {"date"}:
        # best effort: attributes that fail to resolve are left out
        try:
            attributes[attribute_name] = getattr(self, attribute_name)
        except Exception:
            continue

    array_name = self.name or self.__class__.__name__
    return DataArray(
        array,
        coords=spatial_coords,
        dims=dim_names,
        name=array_name,
        attrs=attributes,
    )
+ )
593
661
 
594
662
 
595
663
  class Band(_ImageBandBase):
596
664
  """Band holding a single 2 dimensional array representing an image band."""
597
665
 
598
666
  cmap: ClassVar[str | None] = None
667
+ backend: str = "numpy"
599
668
 
600
669
  @classmethod
601
670
  def from_gdf(
@@ -627,13 +696,12 @@ class Band(_ImageBandBase):
627
696
 
628
697
  def __init__(
629
698
  self,
630
- data: str | np.ndarray,
631
- res: int | None,
699
+ data: str | np.ndarray | None = None,
700
+ res: int | None_ = None_,
632
701
  crs: Any | None = None,
633
702
  bounds: tuple[float, float, float, float] | None = None,
634
703
  nodata: int | None = None,
635
704
  mask: "Band | None" = None,
636
- file_system: GCSFileSystem | None = None,
637
705
  processes: int = 1,
638
706
  name: str | None = None,
639
707
  band_id: str | None = None,
@@ -642,6 +710,16 @@ class Band(_ImageBandBase):
642
710
  **kwargs,
643
711
  ) -> None:
644
712
  """Band initialiser."""
713
+ if callable(res) and isinstance(res(), None_):
714
+ raise TypeError("Must specify 'res'")
715
+
716
+ if data is None:
717
+ # allowing 'path' to replace 'data' as argument
718
+ # to make the print repr. valid as initialiser
719
+ if "path" not in kwargs:
720
+ raise TypeError("Must specify either 'data' or 'path'.")
721
+ data = kwargs.pop("path")
722
+
645
723
  super().__init__(**kwargs)
646
724
 
647
725
  if isinstance(data, (str | Path | os.PathLike)) and any(
@@ -657,20 +735,13 @@ class Band(_ImageBandBase):
657
735
  self._bounds = bounds
658
736
  self._all_file_paths = all_file_paths
659
737
 
660
- self._image = None
661
-
662
- for key in self.metadata_attributes:
663
- setattr(self, key, None)
664
-
665
738
  if isinstance(data, np.ndarray):
666
- self.values = data
667
739
  if self._bounds is None:
668
740
  raise ValueError("Must specify bounds when data is an array.")
669
741
  self._crs = crs
670
- self.transform = _get_transform_from_bounds(
671
- self._bounds, shape=self.values.shape
672
- )
742
+ self.transform = _get_transform_from_bounds(self._bounds, shape=data.shape)
673
743
  self._from_array = True
744
+ self.values = data
674
745
 
675
746
  elif not isinstance(data, (str | Path | os.PathLike)):
676
747
  raise TypeError(
@@ -678,44 +749,48 @@ class Band(_ImageBandBase):
678
749
  f"Got {type(data)}"
679
750
  )
680
751
  else:
681
- self._path = str(data)
752
+ self._path = _fix_path(str(data))
682
753
 
683
754
  self._res = res
684
755
  if cmap is not None:
685
756
  self.cmap = cmap
686
- self.file_system = file_system
687
757
  self._name = name
688
758
  self._band_id = band_id
689
759
  self.processes = processes
690
760
 
691
- if (
692
- kwargs.get("_add_metadata_attributes", True)
693
- and self.metadata_attributes
694
- and self.path is not None
695
- ):
761
+ if self._all_file_paths:
762
+ self._all_file_paths = {_fix_path(path) for path in self._all_file_paths}
763
+ parent = _fix_path(Path(self.path).parent)
764
+ self._all_file_paths = {
765
+ path for path in self._all_file_paths if parent in path
766
+ }
767
+
768
+ if self.metadata:
769
+ if self.path is not None:
770
+ self.metadata = {
771
+ key: value
772
+ for key, value in self.metadata.items()
773
+ if key == self.path
774
+ }
775
+ this_metadata = self.metadata[self.path]
776
+ for key, value in this_metadata.items():
777
+ if key in dir(self):
778
+ setattr(self, f"_{key}", value)
779
+ else:
780
+ setattr(self, key, value)
781
+
782
+ elif self.metadata_attributes and self.path is not None and not self.is_mask:
696
783
  if self._all_file_paths is None:
697
784
  self._all_file_paths = _get_all_file_paths(str(Path(self.path).parent))
698
- self._add_metadata_attributes()
785
+ for key, value in self._get_metadata_attributes(
786
+ self.metadata_attributes
787
+ ).items():
788
+ setattr(self, key, value)
699
789
 
700
790
  def __lt__(self, other: "Band") -> bool:
701
791
  """Makes Bands sortable by band_id."""
702
792
  return self.band_id < other.band_id
703
793
 
704
- # def __getattribute__(self, attr: str) -> Any:
705
- # # try:
706
- # # value =
707
- # # except AttributeError:
708
- # # value = None
709
-
710
- # if (
711
- # attr in (super().__getattribute__("metadata_attributes") or {})
712
- # and super().__getattribute__(attr) is None
713
- # ):
714
- # if self._all_file_paths is None:
715
- # self._all_file_paths = _get_all_file_paths(str(Path(self.path).parent))
716
- # self._add_metadata_attributes()
717
- # return super().__getattribute__(attr)
718
-
719
794
  @property
720
795
  def values(self) -> np.ndarray:
721
796
  """The numpy array, if loaded."""
@@ -725,23 +800,35 @@ class Band(_ImageBandBase):
725
800
 
726
801
  @values.setter
727
802
  def values(self, new_val):
728
- if not isinstance(new_val, np.ndarray):
729
- raise TypeError(
730
- f"{self.__class__.__name__} 'values' must be np.ndarray. Got {type(new_val)}"
731
- )
732
- self._values = new_val
803
+ if self.backend == "numpy" and isinstance(new_val, np.ndarray):
804
+ self._values = new_val
805
+ return
806
+ elif self.backend == "xarray" and isinstance(new_val, DataArray):
807
+ # attrs can dissappear, so doing a union
808
+ attrs = self._values.attrs | new_val.attrs
809
+ self._values = new_val
810
+ self._values.attrs = attrs
811
+ return
812
+
813
+ if self.backend == "numpy":
814
+ self._values = self._to_numpy(new_val)
815
+ if self.backend == "xarray":
816
+ if not isinstance(self._values, DataArray):
817
+ self._values = self._to_xarray(
818
+ new_val,
819
+ transform=self.transform,
820
+ )
821
+
822
+ elif isinstance(new_val, np.ndarray):
823
+ self._values.values = new_val
824
+ else:
825
+ self._values = new_val
733
826
 
734
827
  @property
735
828
  def mask(self) -> "Band":
736
829
  """Mask Band."""
737
830
  return self._mask
738
831
 
739
- @mask.setter
740
- def mask(self, values: "Band") -> None:
741
- if values is not None and not isinstance(values, Band):
742
- raise TypeError(f"'mask' should be of type Band. Got {type(values)}")
743
- self._mask = values
744
-
745
832
  @property
746
833
  def band_id(self) -> str:
747
834
  """Band id."""
@@ -779,11 +866,11 @@ class Band(_ImageBandBase):
779
866
  )
780
867
 
781
868
  @property
782
- def crs(self) -> str | None:
869
+ def crs(self) -> pyproj.CRS | None:
783
870
  """Coordinate reference system."""
784
871
  if self._crs is None:
785
872
  self._add_crs_and_bounds()
786
- return self._crs
873
+ return pyproj.CRS(self._crs)
787
874
 
788
875
  @property
789
876
  def bounds(self) -> tuple[int, int, int, int] | None:
@@ -793,7 +880,7 @@ class Band(_ImageBandBase):
793
880
  return self._bounds
794
881
 
795
882
  def _add_crs_and_bounds(self) -> None:
796
- with opener(self.path, file_system=self.file_system) as file:
883
+ with opener(self.path) as file:
797
884
  with rasterio.open(file) as src:
798
885
  self._bounds = to_bbox(src.bounds)
799
886
  self._crs = src.crs
@@ -820,44 +907,64 @@ class Band(_ImageBandBase):
820
907
  df[column] = f"smallest_{n}"
821
908
  return df
822
909
 
910
def clip(
    self, mask: GeoDataFrame | GeoSeries | Polygon | MultiPolygon, **kwargs
) -> "Band":
    """Clip the band in place to a geometry mask and return the band.

    The clipped array comes from ``_clip_xarray``; additional keyword
    arguments are forwarded to it. Bounds are set to the bounding box of
    'mask' and the transform is recomputed from those bounds and the shape
    of the clipped array.
    """
    as_xarray = self.to_xarray()
    clipped = _clip_xarray(as_xarray, mask, crs=self.crs, **kwargs)

    new_bounds = to_bbox(mask)
    self._bounds = new_bounds
    self.transform = _get_transform_from_bounds(new_bounds, clipped.shape)

    # assigned last so the 'values' setter sees the updated transform
    self.values = clipped
    return self
924
+
823
925
  def load(
824
926
  self,
825
927
  bounds: tuple | Geometry | GeoDataFrame | GeoSeries | None = None,
826
928
  indexes: int | tuple[int] | None = None,
827
929
  masked: bool | None = None,
930
+ file_system=None,
828
931
  **kwargs,
829
932
  ) -> "Band":
830
933
  """Load and potentially clip the array.
831
934
 
832
935
  The array is stored in the 'values' property.
833
936
  """
937
+ global _load_counter
938
+ _load_counter += 1
939
+
834
940
  if masked is None:
835
941
  masked = True if self.mask is None else False
836
942
 
837
943
  bounds_was_none = bounds is None
838
944
 
839
- bounds = _get_bounds(bounds, self._bbox)
945
+ bounds = _get_bounds(bounds, self._bbox, self.union_all())
840
946
 
841
947
  should_return_empty: bool = bounds is not None and bounds.area == 0
842
948
  if should_return_empty:
843
949
  self._values = np.array([])
844
950
  if self.mask is not None and not self.is_mask:
845
- self._mask = self._mask.load()
951
+ self._mask = self._mask.load(
952
+ bounds=bounds, indexes=indexes, file_system=file_system
953
+ )
846
954
  self._bounds = None
847
955
  self.transform = None
848
- try:
849
- self._image._mask = self._mask
850
- except AttributeError:
851
- pass
956
+ self.values = self._values
957
+
852
958
  return self
853
959
 
854
960
  if self.has_array and bounds_was_none:
855
961
  return self
856
962
 
857
- # round down/up to integer to avoid precision trouble
858
963
  if bounds is not None:
859
964
  minx, miny, maxx, maxy = to_bbox(bounds)
860
- bounds = (int(minx), int(miny), math.ceil(maxx), math.ceil(maxy))
965
+ ## round down/up to integer to avoid precision trouble
966
+ # bounds = (int(minx), int(miny), math.ceil(maxx), math.ceil(maxy))
967
+ bounds = minx, miny, maxx, maxy
861
968
 
862
969
  if indexes is None:
863
970
  indexes = 1
@@ -868,126 +975,132 @@ class Band(_ImageBandBase):
868
975
  # allow setting a fixed out_shape for the array, in order to make mask same shape as values
869
976
  out_shape = kwargs.pop("out_shape", None)
870
977
 
871
- if self.has_array:
872
- self.values = _clip_loaded_array(
873
- self.values, bounds, self.transform, self.crs, out_shape, **kwargs
978
+ if self.has_array and [int(x) for x in bounds] != [int(x) for x in self.bounds]:
979
+ print(self)
980
+ print(self.mask)
981
+ print(self.mask.values.shape)
982
+ print(self.values.shape)
983
+ print([int(x) for x in bounds], [int(x) for x in self.bounds])
984
+ raise ValueError(
985
+ "Cannot re-load array with different bounds. "
986
+ "Use .copy() to read with different bounds. "
987
+ "Or .clip(mask) to clip."
874
988
  )
875
- self._bounds = bounds
876
- self.transform = _get_transform_from_bounds(self._bounds, self.values.shape)
877
-
878
- else:
879
- with opener(self.path, file_system=self.file_system) as f:
880
- with rasterio.open(f, nodata=self.nodata) as src:
881
- self._res = int(src.res[0]) if not self.res else self.res
882
-
883
- if self.nodata is None or np.isnan(self.nodata):
884
- self.nodata = src.nodata
885
- else:
886
- dtype_min_value = _get_dtype_min(src.dtypes[0])
887
- dtype_max_value = _get_dtype_max(src.dtypes[0])
888
- if (
889
- self.nodata > dtype_max_value
890
- or self.nodata < dtype_min_value
891
- ):
892
- src._dtypes = tuple(
893
- rasterio.dtypes.get_minimum_dtype(self.nodata)
894
- for _ in range(len(_indexes))
895
- )
989
+ # with opener(self.path, file_system=self.file_system) as f:
990
+ with opener(self.path, file_system=file_system) as f:
991
+ with rasterio.open(f, nodata=self.nodata) as src:
992
+ self._res = int(src.res[0]) if not self.res else self.res
896
993
 
897
- if bounds is None:
898
- if self._res != int(src.res[0]):
899
- if out_shape is None:
900
- out_shape = _get_shape_from_bounds(
901
- to_bbox(src.bounds), self.res, indexes
902
- )
903
- self.transform = _get_transform_from_bounds(
904
- to_bbox(src.bounds), shape=out_shape
905
- )
906
- else:
907
- self.transform = src.transform
908
-
909
- self._values = src.read(
910
- indexes=indexes,
911
- out_shape=out_shape,
912
- masked=masked,
913
- **kwargs,
914
- )
915
- else:
916
- window = rasterio.windows.from_bounds(
917
- *bounds, transform=src.transform
994
+ if self.nodata is None or np.isnan(self.nodata):
995
+ self.nodata = src.nodata
996
+ else:
997
+ dtype_min_value = _get_dtype_min(src.dtypes[0])
998
+ dtype_max_value = _get_dtype_max(src.dtypes[0])
999
+ if self.nodata > dtype_max_value or self.nodata < dtype_min_value:
1000
+ src._dtypes = tuple(
1001
+ rasterio.dtypes.get_minimum_dtype(self.nodata)
1002
+ for _ in range(len(_indexes))
918
1003
  )
919
1004
 
1005
+ if bounds is None:
1006
+ if self._res != int(src.res[0]):
920
1007
  if out_shape is None:
921
1008
  out_shape = _get_shape_from_bounds(
922
- bounds, self.res, indexes
1009
+ to_bbox(src.bounds), self.res, indexes
923
1010
  )
924
-
925
- self._values = src.read(
926
- indexes=indexes,
927
- window=window,
928
- boundless=False,
929
- out_shape=out_shape,
930
- masked=masked,
931
- **kwargs,
1011
+ self.transform = _get_transform_from_bounds(
1012
+ to_bbox(src.bounds), shape=out_shape
932
1013
  )
1014
+ else:
1015
+ self.transform = src.transform
933
1016
 
934
- assert out_shape == self._values.shape, (
935
- out_shape,
936
- self._values.shape,
937
- )
1017
+ values = src.read(
1018
+ indexes=indexes,
1019
+ out_shape=out_shape,
1020
+ masked=masked,
1021
+ **kwargs,
1022
+ )
1023
+ else:
1024
+ window = rasterio.windows.from_bounds(
1025
+ *bounds, transform=src.transform
1026
+ )
1027
+
1028
+ if out_shape is None:
1029
+ out_shape = _get_shape_from_bounds(bounds, self.res, indexes)
938
1030
 
1031
+ values = src.read(
1032
+ indexes=indexes,
1033
+ window=window,
1034
+ boundless=False,
1035
+ out_shape=out_shape,
1036
+ masked=masked,
1037
+ **kwargs,
1038
+ )
1039
+
1040
+ assert out_shape == values.shape, (
1041
+ out_shape,
1042
+ values.shape,
1043
+ )
1044
+
1045
+ width, height = values.shape[-2:]
1046
+
1047
+ if width and height:
939
1048
  self.transform = rasterio.transform.from_bounds(
940
- *bounds, self.width, self.height
1049
+ *bounds, width, height
941
1050
  )
942
- self._bounds = bounds
943
1051
 
944
- if self.nodata is not None and not np.isnan(self.nodata):
945
- if isinstance(self.values, np.ma.core.MaskedArray):
946
- self.values.data[self.values.data == src.nodata] = (
947
- self.nodata
948
- )
949
- else:
950
- self.values[self.values == src.nodata] = self.nodata
1052
+ if self.nodata is not None and not np.isnan(self.nodata):
1053
+ if isinstance(values, np.ma.core.MaskedArray):
1054
+ values.data[values.data == src.nodata] = self.nodata
1055
+ else:
1056
+ values[values == src.nodata] = self.nodata
951
1057
 
952
1058
  if self.masking and self.is_mask:
953
- self.values = np.isin(self.values, self.masking["values"])
1059
+ values = np.isin(values, list(self.masking["values"]))
954
1060
 
955
- elif self.mask is not None and not isinstance(
956
- self.values, np.ma.core.MaskedArray
957
- ):
958
- self.mask = self.mask.copy().load(
959
- bounds=bounds, indexes=indexes, out_shape=out_shape, **kwargs
960
- )
1061
+ elif self.mask is not None and not isinstance(values, np.ma.core.MaskedArray):
1062
+
1063
+ if not self.mask.has_array:
1064
+ self._mask = self.mask.load(
1065
+ bounds=bounds, indexes=indexes, out_shape=out_shape, **kwargs
1066
+ )
961
1067
  mask_arr = self.mask.values
962
1068
 
963
- self._values = np.ma.array(
964
- self._values, mask=mask_arr, fill_value=self.nodata
965
- )
1069
+ values = np.ma.array(values, mask=mask_arr, fill_value=self.nodata)
966
1070
 
967
- try:
968
- self._image._mask = self._mask
969
- except AttributeError:
970
- pass
1071
+ if bounds is not None:
1072
+ self._bounds = to_bbox(bounds)
1073
+
1074
+ self._values = values
1075
+ # trigger the setter
1076
+ self.values = values
971
1077
 
972
1078
  return self
973
1079
 
974
1080
  @property
975
1081
  def is_mask(self) -> bool:
976
1082
  """True if the band_id is equal to the masking band_id."""
1083
+ if self.masking is None:
1084
+ return False
977
1085
  return self.band_id == self.masking["band_id"]
978
1086
 
979
1087
  @property
980
1088
  def has_array(self) -> bool:
981
1089
  """Whether the array is loaded."""
982
1090
  try:
983
- if not isinstance(self.values, np.ndarray):
1091
+ if not isinstance(self.values, (np.ndarray | DataArray)):
984
1092
  raise ValueError()
985
1093
  return True
986
1094
  except ValueError: # also catches ArrayNotLoadedError
987
1095
  return False
988
1096
 
989
1097
  def write(
990
- self, path: str | Path, driver: str = "GTiff", compress: str = "LZW", **kwargs
1098
+ self,
1099
+ path: str | Path,
1100
+ driver: str = "GTiff",
1101
+ compress: str = "LZW",
1102
+ file_system=None,
1103
+ **kwargs,
991
1104
  ) -> None:
992
1105
  """Write the array as an image file."""
993
1106
  if not hasattr(self, "_values"):
@@ -1010,7 +1123,8 @@ class Band(_ImageBandBase):
1010
1123
  "width": self.width,
1011
1124
  } | kwargs
1012
1125
 
1013
- with opener(path, "wb", file_system=self.file_system) as f:
1126
+ # with opener(path, "wb", file_system=self.file_system) as f:
1127
+ with opener(path, "wb", file_system=file_system) as f:
1014
1128
  with rasterio.open(f, "w", **profile) as dst:
1015
1129
 
1016
1130
  if dst.nodata is None:
@@ -1032,17 +1146,14 @@ class Band(_ImageBandBase):
1032
1146
  if isinstance(self.values, np.ma.core.MaskedArray):
1033
1147
  dst.write_mask(self.values.mask)
1034
1148
 
1035
- self._path = str(path)
1149
+ self._path = _fix_path(str(path))
1036
1150
 
1037
1151
  def apply(self, func: Callable, **kwargs) -> "Band":
1038
- """Apply a function to the array."""
1039
- self.values = func(self.values, **kwargs)
1040
- return self
1041
-
1042
- def normalize(self) -> "Band":
1043
- """Normalize array values between 0 and 1."""
1044
- arr = self.values
1045
- self.values = (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
1152
+ """Apply a function to the Band."""
1153
+ results = func(self, **kwargs)
1154
+ if isinstance(results, Band):
1155
+ return results
1156
+ self.values = results
1046
1157
  return self
1047
1158
 
1048
1159
  def sample(self, size: int = 1000, mask: Any = None, **kwargs) -> "Image":
@@ -1200,23 +1311,43 @@ class Band(_ImageBandBase):
1200
1311
  )
1201
1312
 
1202
1313
  def to_xarray(self) -> DataArray:
1203
- """Convert the raster to an xarray.DataArray."""
1204
- name = self.name or self.__class__.__name__.lower()
1205
- coords = _generate_spatial_coords(self.transform, self.width, self.height)
1206
- if len(self.values.shape) == 2:
1207
- dims = ["y", "x"]
1208
- elif len(self.values.shape) == 3:
1209
- dims = ["band", "y", "x"]
1210
- else:
1211
- raise ValueError("Array must be 2 or 3 dimensional.")
1212
- return xr.DataArray(
1314
+ """Convert the raster to an xarray.DataArray."""
1315
+ if self.backend == "xarray":
1316
+ return self.values
1317
+ return self._to_xarray(
1213
1318
  self.values,
1214
- coords=coords,
1215
- dims=dims,
1216
- name=name,
1217
- attrs={"crs": self.crs},
1319
+ transform=self.transform,
1320
+ # name=self.name or self.__class__.__name__.lower(),
1218
1321
  )
1219
1322
 
1323
+ def to_numpy(self) -> np.ndarray | np.ma.core.MaskedArray:
1324
+ """Convert the raster to a numpy.ndarray."""
1325
+ return self._to_numpy(self.values).copy()
1326
+
1327
+ def _to_numpy(
1328
+ self, arr: np.ndarray | DataArray, masked: bool = True
1329
+ ) -> np.ndarray | np.ma.core.MaskedArray:
1330
+ if not isinstance(arr, np.ndarray):
1331
+ if masked:
1332
+ try:
1333
+ mask_arr = arr.isnull().values
1334
+ except AttributeError:
1335
+ mask_arr = np.full(arr.shape, False)
1336
+ try:
1337
+ arr = arr.to_numpy()
1338
+ except AttributeError:
1339
+ arr = arr.values
1340
+ if not isinstance(arr, np.ndarray):
1341
+ arr = np.array(arr)
1342
+ if (
1343
+ masked
1344
+ and self.mask is not None
1345
+ and not self.is_mask
1346
+ and not isinstance(arr, np.ma.core.MaskedArray)
1347
+ ):
1348
+ arr = np.ma.array(arr, mask=mask_arr, fill_value=self.nodata)
1349
+ return arr
1350
+
1220
1351
  def __repr__(self) -> str:
1221
1352
  """String representation."""
1222
1353
  try:
@@ -1252,12 +1383,12 @@ class Image(_ImageBandBase):
1252
1383
  """Image consisting of one or more Bands."""
1253
1384
 
1254
1385
  band_class: ClassVar[Band] = Band
1386
+ backend: str = "numpy"
1255
1387
 
1256
1388
  def __init__(
1257
1389
  self,
1258
- data: str | Path | Sequence[Band],
1390
+ data: str | Path | Sequence[Band] | None = None,
1259
1391
  res: int | None = None,
1260
- file_system: GCSFileSystem | None = None,
1261
1392
  processes: int = 1,
1262
1393
  df: pd.DataFrame | None = None,
1263
1394
  nodata: int | None = None,
@@ -1265,44 +1396,38 @@ class Image(_ImageBandBase):
1265
1396
  **kwargs,
1266
1397
  ) -> None:
1267
1398
  """Image initialiser."""
1399
+ if data is None:
1400
+ # allowing 'bands' to replace 'data' as argument
1401
+ # to make the print repr. valid as initialiser
1402
+ if "bands" not in kwargs:
1403
+ raise TypeError("Must specify either 'data' or 'bands'.")
1404
+ data = kwargs.pop("bands")
1405
+
1268
1406
  super().__init__(**kwargs)
1269
1407
 
1270
1408
  self.nodata = nodata
1271
- self._res = res
1272
- self._crs = None
1273
- self.file_system = file_system
1274
1409
  self.processes = processes
1410
+ self._crs = None
1411
+ self._bands = None
1275
1412
 
1276
1413
  if hasattr(data, "__iter__") and all(isinstance(x, Band) for x in data):
1277
- self._bands = list(data)
1278
- if res is None:
1279
- res = list({band.res for band in self.bands})
1280
- if len(res) == 1:
1281
- self._res = res[0]
1282
- else:
1283
- raise ValueError(f"Different resolutions for the bands: {res}")
1284
- else:
1285
- self._res = res
1414
+ self._construct_image_from_bands(data, res)
1286
1415
  return
1287
-
1288
- if not isinstance(data, (str | Path | os.PathLike)):
1416
+ elif not isinstance(data, (str | Path | os.PathLike)):
1289
1417
  raise TypeError("'data' must be string, Path-like or a sequence of Band.")
1290
1418
 
1291
- self._bands = None
1292
- self._path = _fix_path(data) # str(data).rstrip("/").rstrip(r"\"")
1419
+ self._res = res
1420
+ self._path = _fix_path(data)
1293
1421
 
1294
1422
  if all_file_paths is None and self.path:
1295
1423
  self._all_file_paths = _get_all_file_paths(self.path)
1296
1424
  elif self.path:
1297
- self._all_file_paths = [
1298
- x for x in all_file_paths if self.path in _fix_path(x)
1299
- ]
1425
+ all_file_paths = {_fix_path(x) for x in all_file_paths}
1426
+ self._all_file_paths = {x for x in all_file_paths if self.path in x}
1300
1427
  else:
1301
1428
  self._all_file_paths = None
1302
1429
 
1303
1430
  if df is None:
1304
- # file_paths = _get_all_file_paths(self.path)
1305
-
1306
1431
  if not self._all_file_paths:
1307
1432
  self._all_file_paths = [self.path]
1308
1433
  df = self._create_metadata_df(self._all_file_paths)
@@ -1311,7 +1436,7 @@ class Image(_ImageBandBase):
1311
1436
 
1312
1437
  cols_to_explode = [
1313
1438
  "file_path",
1314
- "filename",
1439
+ "file_name",
1315
1440
  *[x for x in df if FILENAME_COL_SUFFIX in x],
1316
1441
  ]
1317
1442
  try:
@@ -1319,34 +1444,82 @@ class Image(_ImageBandBase):
1319
1444
  except ValueError:
1320
1445
  for col in cols_to_explode:
1321
1446
  df = df.explode(col)
1322
- df = df.loc[lambda x: ~x["filename"].duplicated()].reset_index(drop=True)
1447
+ df = df.loc[lambda x: ~x["file_name"].duplicated()].reset_index(drop=True)
1323
1448
 
1324
- df = df.loc[lambda x: x["image_path"] == _fix_path(self.path)]
1449
+ df = df.loc[lambda x: x["image_path"] == self.path]
1325
1450
 
1326
1451
  self._df = df
1327
1452
 
1453
+ if self.path is not None and self.metadata:
1454
+ self.metadata = {
1455
+ key: value for key, value in self.metadata.items() if self.path in key
1456
+ }
1457
+
1458
+ if self.metadata:
1459
+ try:
1460
+ metadata = self.metadata[self.path]
1461
+ except KeyError:
1462
+ metadata = {}
1463
+ for key, value in metadata.items():
1464
+ if key in dir(self):
1465
+ setattr(self, f"_{key}", value)
1466
+ else:
1467
+ setattr(self, key, value)
1468
+
1469
+ else:
1470
+ for key, value in self._get_metadata_attributes(
1471
+ self.metadata_attributes
1472
+ ).items():
1473
+ setattr(self, key, value)
1474
+
1475
+ def _construct_image_from_bands(
1476
+ self, data: Sequence[Band], res: int | None
1477
+ ) -> None:
1478
+ self._bands = list(data)
1479
+ if res is None:
1480
+ res = list({band.res for band in self.bands})
1481
+ if len(res) == 1:
1482
+ self._res = res[0]
1483
+ else:
1484
+ raise ValueError(f"Different resolutions for the bands: {res}")
1485
+ else:
1486
+ self._res = res
1328
1487
  for key in self.metadata_attributes:
1329
- setattr(self, key, None)
1488
+ band_values = {getattr(band, key) for band in self if hasattr(band, key)}
1489
+ band_values = {x for x in band_values if x is not None}
1490
+ if len(band_values) > 1:
1491
+ raise ValueError(f"Different {key} values in bands: {band_values}")
1492
+ elif len(band_values):
1493
+ try:
1494
+ setattr(self, key, next(iter(band_values)))
1495
+ except AttributeError:
1496
+ setattr(self, f"_{key}", next(iter(band_values)))
1330
1497
 
1331
- if self.metadata_attributes:
1332
- self._add_metadata_attributes()
1498
+ def copy(self) -> "Image":
1499
+ """Copy the instance and its attributes."""
1500
+ copied = super().copy()
1501
+ for band in copied:
1502
+ band._mask = copied._mask
1503
+ return copied
1333
1504
 
1334
- @property
1335
- def values(self) -> np.ndarray:
1336
- """3 dimensional numpy array."""
1337
- values = [band.values for band in self]
1338
- if self.mask is not None:
1339
- mask = [band.mask.values for band in self]
1340
- return np.ma.array(values, mask=mask, fill_value=self.nodata)
1341
- return np.array(values)
1342
-
1343
- def ndvi(self, red_band: str, nir_band: str, copy: bool = True) -> NDVIBand:
1505
+ def apply(self, func: Callable, **kwargs) -> "Image":
1506
+ """Apply a function to each band of the Image."""
1507
+ with joblib.Parallel(n_jobs=self.processes, backend="loky") as parallel:
1508
+ parallel(joblib.delayed(_band_apply)(band, func, **kwargs) for band in self)
1509
+
1510
+ return self
1511
+
1512
+ def ndvi(
1513
+ self, red_band: str, nir_band: str, padding: int = 0, copy: bool = True
1514
+ ) -> NDVIBand:
1344
1515
  """Calculate the NDVI for the Image."""
1345
1516
  copied = self.copy() if copy else self
1346
1517
  red = copied[red_band].load()
1347
1518
  nir = copied[nir_band].load()
1348
1519
 
1349
- arr: np.ndarray | np.ma.core.MaskedArray = ndvi(red.values, nir.values)
1520
+ arr: np.ndarray | np.ma.core.MaskedArray = ndvi(
1521
+ red.values, nir.values, padding=padding
1522
+ )
1350
1523
 
1351
1524
  return NDVIBand(
1352
1525
  arr,
@@ -1390,56 +1563,61 @@ class Image(_ImageBandBase):
1390
1563
 
1391
1564
  def to_xarray(self) -> DataArray:
1392
1565
  """Convert the raster to an xarray.DataArray."""
1393
- name = self.name or self.__class__.__name__.lower()
1394
- coords = _generate_spatial_coords(
1395
- self[0].transform, self[0].width, self[0].height
1396
- )
1397
- dims = ["band", "y", "x"]
1398
- return xr.DataArray(
1399
- self.values,
1400
- coords=coords,
1401
- dims=dims,
1402
- name=name,
1403
- attrs={"crs": self.crs},
1566
+ if self.backend == "xarray":
1567
+ return self.values
1568
+
1569
+ return self._to_xarray(
1570
+ np.array([band.values for band in self]),
1571
+ transform=self[0].transform,
1404
1572
  )
1405
1573
 
1406
1574
  @property
1407
1575
  def mask(self) -> Band | None:
1408
1576
  """Mask Band."""
1409
- if self._mask is not None:
1410
- # if not self._mask.has_array:
1411
- # try:
1412
- # self._mask.values = self[0]._mask.values
1413
- # except Exception:
1414
- # pass
1415
- return self._mask
1416
1577
  if self.masking is None:
1417
1578
  return None
1418
1579
 
1580
+ elif self._mask is not None:
1581
+ return self._mask
1582
+
1583
+ elif self._bands is not None and all(band.mask is not None for band in self):
1584
+ if len({id(band.mask) for band in self}) > 1:
1585
+ raise ValueError(
1586
+ "Image bands must have same mask.",
1587
+ {id(band.mask) for band in self},
1588
+ ) # TODO
1589
+ self._mask = next(
1590
+ iter([band.mask for band in self if band.mask is not None])
1591
+ )
1592
+ return self._mask
1593
+
1419
1594
  mask_band_id = self.masking["band_id"]
1420
- mask_paths = [path for path in self._df["file_path"] if mask_band_id in path]
1595
+ mask_paths = [path for path in self._all_file_paths if mask_band_id in path]
1421
1596
  if len(mask_paths) > 1:
1422
1597
  raise ValueError(
1423
1598
  f"Multiple file_paths match mask band_id {mask_band_id} for {self.path}"
1424
1599
  )
1425
1600
  elif not mask_paths:
1426
1601
  raise ValueError(
1427
- f"No file_paths match mask band_id {mask_band_id} for {self.path}"
1602
+ f"No file_paths match mask band_id {mask_band_id} for {self.path} among "
1603
+ + str([Path(x).name for x in _ls_func(self.path)])
1428
1604
  )
1605
+
1429
1606
  self._mask = self.band_class(
1430
1607
  mask_paths[0],
1431
- _add_metadata_attributes=False,
1432
1608
  **self._common_init_kwargs,
1433
1609
  )
1434
-
1610
+ if self._bands is not None:
1611
+ for band in self:
1612
+ band._mask = self._mask
1435
1613
  return self._mask
1436
1614
 
1437
1615
  @mask.setter
1438
- def mask(self, values: Band) -> None:
1616
+ def mask(self, values: Band | None) -> None:
1439
1617
  if values is None:
1440
1618
  self._mask = None
1441
1619
  for band in self:
1442
- band.mask = None
1620
+ band._mask = None
1443
1621
  return
1444
1622
  if not isinstance(values, Band):
1445
1623
  raise TypeError(f"mask must be Band. Got {type(values)}")
@@ -1449,7 +1627,7 @@ class Image(_ImageBandBase):
1449
1627
  band._mask = self._mask
1450
1628
  try:
1451
1629
  band.values = np.ma.array(
1452
- band.values, mask=mask_arr, fill_value=band.nodata
1630
+ band.values.data, mask=mask_arr, fill_value=band.nodata
1453
1631
  )
1454
1632
  except ArrayNotLoadedError:
1455
1633
  pass
@@ -1470,22 +1648,24 @@ class Image(_ImageBandBase):
1470
1648
  if self._bands is not None:
1471
1649
  return self._bands
1472
1650
 
1651
+ if self.masking:
1652
+ mask_band_id = self.masking["band_id"]
1653
+ paths = [path for path in self._df["file_path"] if mask_band_id not in path]
1654
+ else:
1655
+ paths = self._df["file_path"]
1656
+
1657
+ mask = self.mask
1658
+
1473
1659
  self._bands = [
1474
1660
  self.band_class(
1475
1661
  path,
1476
- mask=self.mask,
1477
- _add_metadata_attributes=False,
1662
+ mask=mask,
1663
+ all_file_paths=self._all_file_paths,
1478
1664
  **self._common_init_kwargs,
1479
1665
  )
1480
- for path in (self._df["file_path"])
1666
+ for path in paths
1481
1667
  ]
1482
1668
 
1483
- if self.masking:
1484
- mask_band_id = self.masking["band_id"]
1485
- self._bands = [
1486
- band for band in self._bands if mask_band_id not in band.path
1487
- ]
1488
-
1489
1669
  if (
1490
1670
  self.filename_patterns
1491
1671
  and any(_get_non_optional_groups(pat) for pat in self.filename_patterns)
@@ -1514,30 +1694,19 @@ class Image(_ImageBandBase):
1514
1694
  if self._should_be_sorted:
1515
1695
  self._bands = list(sorted(self._bands))
1516
1696
 
1517
- for key in self.metadata_attributes:
1518
- for band in self:
1519
- value = getattr(self, key)
1520
- if value is None:
1521
- continue
1522
- if isinstance(value, BandIdDict):
1523
- try:
1524
- value = value[band.band_id]
1525
- except KeyError:
1526
- continue
1527
- setattr(band, key, value)
1528
-
1529
- for band in self:
1530
- band._image = self
1531
-
1532
1697
  return self._bands
1533
1698
 
1534
1699
  @property
1535
1700
  def _should_be_sorted(self) -> bool:
1536
1701
  sort_groups = ["band", "band_id"]
1537
- return self.filename_patterns and any(
1538
- group in _get_non_optional_groups(pat)
1539
- for group in sort_groups
1540
- for pat in self.filename_patterns
1702
+ return (
1703
+ self.filename_patterns
1704
+ and any(
1705
+ group in _get_non_optional_groups(pat)
1706
+ for group in sort_groups
1707
+ for pat in self.filename_patterns
1708
+ )
1709
+ or all(band.band_id is not None for band in self)
1541
1710
  )
1542
1711
 
1543
1712
  @property
@@ -1613,7 +1782,7 @@ class Image(_ImageBandBase):
1613
1782
  if isinstance(band, str):
1614
1783
  return self._get_band(band)
1615
1784
  if isinstance(band, int):
1616
- return self.bands[band] # .copy()
1785
+ return self.bands[band]
1617
1786
 
1618
1787
  copied = self.copy()
1619
1788
  try:
@@ -1639,10 +1808,7 @@ class Image(_ImageBandBase):
1639
1808
  try:
1640
1809
  return self.date < other.date
1641
1810
  except Exception as e:
1642
- print(self.path)
1643
- print(self.date)
1644
- print(other.path)
1645
- print(other.date)
1811
+ print("", self.path, self.date, other.path, other.date, sep="\n")
1646
1812
  raise e
1647
1813
 
1648
1814
  def __iter__(self) -> Iterator[Band]:
@@ -1702,36 +1868,36 @@ class ImageCollection(_ImageBase):
1702
1868
  image_class: ClassVar[Image] = Image
1703
1869
  band_class: ClassVar[Band] = Band
1704
1870
  _metadata_attribute_collection_type: ClassVar[type] = pd.Series
1871
+ backend: str = "numpy"
1705
1872
 
1706
1873
  def __init__(
1707
1874
  self,
1708
1875
  data: str | Path | Sequence[Image] | Sequence[str | Path],
1709
1876
  res: int,
1710
- level: str | None = NoLevel,
1877
+ level: str | None = None_,
1711
1878
  processes: int = 1,
1712
- file_system: GCSFileSystem | None = None,
1713
1879
  metadata: str | dict | pd.DataFrame | None = None,
1714
1880
  nodata: int | None = None,
1715
1881
  **kwargs,
1716
1882
  ) -> None:
1717
1883
  """Initialiser."""
1718
- super().__init__(**kwargs)
1884
+ if data is not None and kwargs.get("root"):
1885
+ root = _fix_path(kwargs.pop("root"))
1886
+ data = [f"{root}/{name}" for name in data]
1887
+ _from_root = True
1888
+ else:
1889
+ _from_root = False
1890
+
1891
+ super().__init__(metadata=metadata, **kwargs)
1892
+
1893
+ if callable(level) and isinstance(level(), None_):
1894
+ level = None
1719
1895
 
1720
1896
  self.nodata = nodata
1721
- self.level = level if not isinstance(level, NoLevel) else None
1897
+ self.level = level
1722
1898
  self.processes = processes
1723
- self.file_system = file_system
1724
1899
  self._res = res
1725
- self._band_ids = None
1726
- self._crs = None # crs
1727
-
1728
- if metadata is not None:
1729
- if isinstance(metadata, (str | Path | os.PathLike)):
1730
- self.metadata = _read_parquet_func(metadata)
1731
- else:
1732
- self.metadata = metadata
1733
- else:
1734
- self.metadata = metadata
1900
+ self._crs = None
1735
1901
 
1736
1902
  self._df = None
1737
1903
  self._all_file_paths = None
@@ -1743,18 +1909,22 @@ class ImageCollection(_ImageBase):
1743
1909
  self.images = [x.copy() for x in data]
1744
1910
  return
1745
1911
  elif all(isinstance(x, (str | Path | os.PathLike)) for x in data):
1746
- self._all_file_paths = list(
1747
- itertools.chain.from_iterable(
1748
- _get_all_file_paths(str(path)) for path in data
1749
- )
1750
- )
1751
- self._df = self._create_metadata_df([str(x) for x in data])
1912
+ # adding band paths (asuming 'data' is a sequence of image paths)
1913
+ try:
1914
+ self._all_file_paths = _get_child_paths_threaded(data) | set(data)
1915
+ except FileNotFoundError as e:
1916
+ if _from_root:
1917
+ raise TypeError(
1918
+ "When passing 'root', 'data' must be a sequence of image names that have 'root' as parent path."
1919
+ ) from e
1920
+ raise e
1921
+ self._df = self._create_metadata_df(self._all_file_paths)
1752
1922
  return
1753
1923
 
1754
1924
  if not isinstance(data, (str | Path | os.PathLike)):
1755
1925
  raise TypeError("'data' must be string, Path-like or a sequence of Image.")
1756
1926
 
1757
- self._path = str(data)
1927
+ self._path = _fix_path(str(data))
1758
1928
 
1759
1929
  self._all_file_paths = _get_all_file_paths(self.path)
1760
1930
 
@@ -1765,18 +1935,6 @@ class ImageCollection(_ImageBase):
1765
1935
 
1766
1936
  self._df = self._create_metadata_df(self._all_file_paths)
1767
1937
 
1768
- @property
1769
- def values(self) -> np.ndarray:
1770
- """4 dimensional numpy array."""
1771
- if isinstance(self[0].values, np.ma.core.MaskedArray):
1772
- return np.ma.array([img.values for img in self])
1773
- return np.array([img.values for img in self])
1774
-
1775
- @property
1776
- def mask(self) -> np.ndarray:
1777
- """4 dimensional numpy array."""
1778
- return np.array([img.mask.values for img in self])
1779
-
1780
1938
  def groupby(self, by: str | list[str], **kwargs) -> ImageCollectionGroupBy:
1781
1939
  """Group the Collection by Image or Band attribute(s)."""
1782
1940
  df = pd.DataFrame(
@@ -1830,15 +1988,20 @@ class ImageCollection(_ImageBase):
1830
1988
  for img in copied:
1831
1989
  assert len(img) == 1
1832
1990
  try:
1833
- img._path = img[0].path
1991
+ img._path = _fix_path(img[0].path)
1834
1992
  except PathlessImageError:
1835
1993
  pass
1836
1994
  return copied
1837
1995
 
1838
1996
  def apply(self, func: Callable, **kwargs) -> "ImageCollection":
1839
1997
  """Apply a function to all bands in each image of the collection."""
1840
- for img in self:
1841
- img._bands = [func(band, **kwargs) for band in img]
1998
+ with joblib.Parallel(n_jobs=self.processes, backend="loky") as parallel:
1999
+ parallel(
2000
+ joblib.delayed(_band_apply)(band, func, **kwargs)
2001
+ for img in self
2002
+ for band in img
2003
+ )
2004
+
1842
2005
  return self
1843
2006
 
1844
2007
  def get_unique_band_ids(self) -> list[str]:
@@ -1851,7 +2014,7 @@ class ImageCollection(_ImageBase):
1851
2014
  date_ranges: DATE_RANGES_TYPE = None,
1852
2015
  bbox: GeoDataFrame | GeoSeries | Geometry | tuple[float] | None = None,
1853
2016
  intersects: GeoDataFrame | GeoSeries | Geometry | tuple[float] | None = None,
1854
- max_cloud_coverage: int | None = None,
2017
+ max_cloud_cover: int | None = None,
1855
2018
  copy: bool = True,
1856
2019
  ) -> "ImageCollection":
1857
2020
  """Filter images and bands in the collection."""
@@ -1860,11 +2023,11 @@ class ImageCollection(_ImageBase):
1860
2023
  if date_ranges:
1861
2024
  copied = copied._filter_dates(date_ranges)
1862
2025
 
1863
- if max_cloud_coverage is not None:
2026
+ if max_cloud_cover is not None:
1864
2027
  copied.images = [
1865
2028
  image
1866
2029
  for image in copied.images
1867
- if image.cloud_coverage_percentage < max_cloud_coverage
2030
+ if image.cloud_cover_percentage < max_cloud_cover
1868
2031
  ]
1869
2032
 
1870
2033
  if bbox is not None:
@@ -1878,7 +2041,6 @@ class ImageCollection(_ImageBase):
1878
2041
  if isinstance(bands, str):
1879
2042
  bands = [bands]
1880
2043
  bands = set(bands)
1881
- copied._band_ids = bands
1882
2044
  copied.images = [img[bands] for img in copied.images if bands in img]
1883
2045
 
1884
2046
  return copied
@@ -1892,7 +2054,7 @@ class ImageCollection(_ImageBase):
1892
2054
  **kwargs,
1893
2055
  ) -> Band:
1894
2056
  """Merge all areas and all bands to a single Band."""
1895
- bounds = _get_bounds(bounds, self._bbox)
2057
+ bounds = _get_bounds(bounds, self._bbox, self.union_all())
1896
2058
  if bounds is not None:
1897
2059
  bounds = to_bbox(bounds)
1898
2060
 
@@ -1930,14 +2092,14 @@ class ImageCollection(_ImageBase):
1930
2092
  **kwargs,
1931
2093
  )
1932
2094
 
1933
- if isinstance(indexes, int) and len(arr.shape) == 3 and arr.shape[0] == 1:
1934
- arr = arr[0]
2095
+ if isinstance(indexes, int) and len(arr.shape) == 3 and arr.shape[0] == 1:
2096
+ arr = arr[0]
1935
2097
 
1936
- if method == "mean":
1937
- if as_int:
1938
- arr = arr // len(datasets)
1939
- else:
1940
- arr = arr / len(datasets)
2098
+ if method == "mean":
2099
+ if as_int:
2100
+ arr = arr // len(datasets)
2101
+ else:
2102
+ arr = arr / len(datasets)
1941
2103
 
1942
2104
  if bounds is None:
1943
2105
  bounds = self.bounds
@@ -1963,7 +2125,7 @@ class ImageCollection(_ImageBase):
1963
2125
  **kwargs,
1964
2126
  ) -> Image:
1965
2127
  """Merge all areas to a single tile, one band per band_id."""
1966
- bounds = _get_bounds(bounds, self._bbox)
2128
+ bounds = _get_bounds(bounds, self._bbox, self.union_all())
1967
2129
  if bounds is not None:
1968
2130
  bounds = to_bbox(bounds)
1969
2131
  bounds = self.bounds if bounds is None else bounds
@@ -2021,7 +2183,6 @@ class ImageCollection(_ImageBase):
2021
2183
  bounds=out_bounds,
2022
2184
  crs=crs,
2023
2185
  band_id=band_id,
2024
- _add_metadata_attributes=False,
2025
2186
  **self._common_init_kwargs,
2026
2187
  )
2027
2188
  )
@@ -2061,10 +2222,13 @@ class ImageCollection(_ImageBase):
2061
2222
  arr = np.array(
2062
2223
  [
2063
2224
  (
2064
- band.load(
2065
- bounds=(_bounds if _bounds is not None else None),
2066
- **kwargs,
2067
- )
2225
+ # band.load(
2226
+ # bounds=(_bounds if _bounds is not None else None),
2227
+ # **kwargs,
2228
+ # )
2229
+ # if not band.has_array
2230
+ # else
2231
+ band
2068
2232
  ).values
2069
2233
  for img in collection
2070
2234
  for band in img
@@ -2087,7 +2251,7 @@ class ImageCollection(_ImageBase):
2087
2251
  coords = _generate_spatial_coords(transform, width, height)
2088
2252
 
2089
2253
  arrs.append(
2090
- xr.DataArray(
2254
+ DataArray(
2091
2255
  arr,
2092
2256
  coords=coords,
2093
2257
  dims=["y", "x"],
@@ -2104,7 +2268,7 @@ class ImageCollection(_ImageBase):
2104
2268
  return merged.to_numpy()
2105
2269
 
2106
2270
  def sort_images(self, ascending: bool = True) -> "ImageCollection":
2107
- """Sort Images by date."""
2271
+ """Sort Images by date, then file path if date attribute is missing."""
2108
2272
  self._images = (
2109
2273
  list(sorted([img for img in self if img.date is not None]))
2110
2274
  + sorted(
@@ -2121,6 +2285,7 @@ class ImageCollection(_ImageBase):
2121
2285
  self,
2122
2286
  bounds: tuple | Geometry | GeoDataFrame | GeoSeries | None = None,
2123
2287
  indexes: int | tuple[int] | None = None,
2288
+ file_system=None,
2124
2289
  **kwargs,
2125
2290
  ) -> "ImageCollection":
2126
2291
  """Load all image Bands with threading."""
@@ -2130,10 +2295,46 @@ class ImageCollection(_ImageBase):
2130
2295
  and all(band.has_array for img in self for band in img)
2131
2296
  ):
2132
2297
  return self
2298
+
2299
+ # if self.processes == 1:
2300
+ # for img in self:
2301
+ # for band in img:
2302
+ # band.load(
2303
+ # bounds=bounds,
2304
+ # indexes=indexes,
2305
+ # file_system=file_system,
2306
+ # **kwargs,
2307
+ # )
2308
+ # return self
2309
+
2133
2310
  with joblib.Parallel(n_jobs=self.processes, backend="threading") as parallel:
2311
+ if self.masking:
2312
+ parallel(
2313
+ joblib.delayed(_load_band)(
2314
+ img.mask,
2315
+ bounds=bounds,
2316
+ indexes=indexes,
2317
+ file_system=file_system,
2318
+ **kwargs,
2319
+ )
2320
+ for img in self
2321
+ )
2322
+ for img in self:
2323
+ for band in img:
2324
+ band._mask = img.mask
2325
+
2326
+ # print({img.mask.has_array for img in self })
2327
+ # print({band.mask.has_array for img in self for band in img})
2328
+
2329
+ # with joblib.Parallel(n_jobs=self.processes, backend="threading") as parallel:
2330
+
2134
2331
  parallel(
2135
2332
  joblib.delayed(_load_band)(
2136
- band, bounds=bounds, indexes=indexes, **kwargs
2333
+ band,
2334
+ bounds=bounds,
2335
+ indexes=indexes,
2336
+ file_system=file_system,
2337
+ **kwargs,
2137
2338
  )
2138
2339
  for img in self
2139
2340
  for band in img
@@ -2141,6 +2342,27 @@ class ImageCollection(_ImageBase):
2141
2342
 
2142
2343
  return self
2143
2344
 
2345
+ def clip(
2346
+ self,
2347
+ mask: Geometry | GeoDataFrame | GeoSeries,
2348
+ **kwargs,
2349
+ ) -> "ImageCollection":
2350
+ """Clip all image Bands with 'loky'."""
2351
+ if self.processes == 1:
2352
+ for img in self:
2353
+ for band in img:
2354
+ band.clip(mask, **kwargs)
2355
+ return self
2356
+
2357
+ with joblib.Parallel(n_jobs=self.processes, backend="loky") as parallel:
2358
+ parallel(
2359
+ joblib.delayed(_clip_band)(band, mask, **kwargs)
2360
+ for img in self
2361
+ for band in img
2362
+ )
2363
+
2364
+ return self
2365
+
2144
2366
  def _set_bbox(
2145
2367
  self, bbox: GeoDataFrame | GeoSeries | Geometry | tuple[float]
2146
2368
  ) -> "ImageCollection":
@@ -2150,12 +2372,17 @@ class ImageCollection(_ImageBase):
2150
2372
  if self._images is not None:
2151
2373
  for img in self._images:
2152
2374
  img._bbox = self._bbox
2375
+ if img.mask is not None:
2376
+ img.mask._bbox = self._bbox
2153
2377
  if img.bands is None:
2154
2378
  continue
2155
2379
  for band in img:
2156
2380
  band._bbox = self._bbox
2157
2381
  bounds = box(*band._bbox).intersection(box(*band.bounds))
2158
2382
  band._bounds = to_bbox(bounds) if not bounds.is_empty else None
2383
+ if band.mask is not None:
2384
+ band.mask._bbox = self._bbox
2385
+ band.mask._bounds = band._bounds
2159
2386
 
2160
2387
  return self
2161
2388
 
@@ -2184,11 +2411,15 @@ class ImageCollection(_ImageBase):
2184
2411
 
2185
2412
  other = to_shapely(other)
2186
2413
 
2187
- # intersects_list = GeoSeries([img.union_all() for img in self]).intersects(other)
2188
- with joblib.Parallel(n_jobs=self.processes, backend="loky") as parallel:
2189
- intersects_list: list[bool] = parallel(
2190
- joblib.delayed(_intesects)(image, other) for image in self
2191
- )
2414
+ if self.processes == 1:
2415
+ intersects_list: pd.Series = GeoSeries(
2416
+ [img.union_all() for img in self]
2417
+ ).intersects(other)
2418
+ else:
2419
+ with joblib.Parallel(n_jobs=self.processes, backend="loky") as parallel:
2420
+ intersects_list: list[bool] = parallel(
2421
+ joblib.delayed(_intesects)(image, other) for image in self
2422
+ )
2192
2423
 
2193
2424
  self.images = [
2194
2425
  image
@@ -2197,37 +2428,68 @@ class ImageCollection(_ImageBase):
2197
2428
  ]
2198
2429
  return self
2199
2430
 
2200
- def to_xarray(self, **kwargs) -> DataArray:
2201
- """Convert the raster to an xarray.DataArray."""
2202
- # arrs = []
2203
- # for img in self:
2204
- # for band in img:
2205
- # arr = band.load(**kwargs).values
2206
- # arrs.append(arr)
2207
-
2208
- # n_images = len(self)
2209
- # n_bands = len(img)
2210
- # height, width = arr.shape
2211
-
2212
- # arr_4d = np.array(arrs).reshape(n_images, n_bands, height, width)
2431
+ def to_xarray(
2432
+ self,
2433
+ **kwargs,
2434
+ ) -> Dataset:
2435
+ """Convert the raster to an xarray.Dataset.
2213
2436
 
2214
- try:
2215
- name = Path(self.path).stem
2216
- except TypeError:
2217
- name = self.__class__.__name__.lower()
2437
+ Images are converted to 2d arrays for each unique bounds.
2438
+ The spatial dimensions will be labeled "x" and "y". The third
2439
+ dimension defaults to "date" if all images have date attributes.
2440
+ Otherwise defaults to the image name.
2441
+ """
2442
+ if any(not band.has_array for img in self for band in img):
2443
+ raise ValueError("Arrays must be loaded.")
2444
+
2445
+ # if by is None:
2446
+ if all(img.date for img in self):
2447
+ by = ["date"]
2448
+ elif not pd.Index([img.name for img in self]).is_unique:
2449
+ raise ValueError("Images must have unique names.")
2450
+ else:
2451
+ by = ["name"]
2452
+ # elif isinstance(by, str):
2453
+ # by = [by]
2454
+
2455
+ xarrs: dict[str, DataArray] = {}
2456
+ for (bounds, band_id), collection in self.groupby(["bounds", "band_id"]):
2457
+ name = f"{band_id}_{'-'.join(str(int(x)) for x in bounds)}"
2458
+ first_band = collection[0][0]
2459
+ coords = _generate_spatial_coords(
2460
+ first_band.transform, first_band.width, first_band.height
2461
+ )
2462
+ values = np.array([band.to_numpy() for img in collection for band in img])
2463
+ assert len(values) == len(collection)
2464
+
2465
+ # coords["band_id"] = [
2466
+ # band.band_id or i for i, band in enumerate(collection[0])
2467
+ # ]
2468
+ for attr in by:
2469
+ coords[attr] = [getattr(img, attr) for img in collection]
2470
+ # coords["band"] = band_id #
2471
+
2472
+ dims = [*by, "y", "x"]
2473
+ # dims = ["band", "y", "x"]
2474
+ # dims = {}
2475
+ # for attr in by:
2476
+ # dims[attr] = [getattr(img, attr) for img in collection]
2477
+
2478
+ xarrs[name] = DataArray(
2479
+ values,
2480
+ coords=coords,
2481
+ dims=dims,
2482
+ # name=name,
2483
+ name=band_id,
2484
+ attrs={
2485
+ "crs": collection.crs,
2486
+ "band_id": band_id,
2487
+ }, # , "bounds": bounds},
2488
+ **kwargs,
2489
+ )
2218
2490
 
2219
- first_band = self[0][0]
2220
- coords = _generate_spatial_coords(
2221
- first_band.transform, first_band.width, first_band.height
2222
- )
2223
- dims = ["image", "band", "y", "x"]
2224
- return xr.DataArray(
2225
- self.values,
2226
- coords=coords,
2227
- dims=dims,
2228
- name=name,
2229
- attrs={"crs": self.crs},
2230
- )
2491
+ return xr.combine_by_coords(list(xarrs.values()))
2492
+ # return Dataset(xarrs)
2231
2493
 
2232
2494
  def to_gdfs(self, column: str = "value") -> dict[str, GeoDataFrame]:
2233
2495
  """Convert each band in each Image to a GeoDataFrame."""
@@ -2241,8 +2503,6 @@ class ImageCollection(_ImageBase):
2241
2503
  except AttributeError:
2242
2504
  name = f"{self.__class__.__name__}({i})"
2243
2505
 
2244
- # band.load()
2245
-
2246
2506
  if name not in out:
2247
2507
  out[name] = band.to_gdf(column=column)
2248
2508
  return out
@@ -2384,36 +2644,22 @@ class ImageCollection(_ImageBase):
2384
2644
  masking=self.masking,
2385
2645
  **self._common_init_kwargs,
2386
2646
  )
2647
+
2387
2648
  if self.masking is not None:
2388
2649
  images = []
2389
2650
  for image in self._images:
2651
+ # TODO why this loop?
2390
2652
  try:
2391
2653
  if not isinstance(image.mask, Band):
2392
2654
  raise ValueError()
2393
2655
  images.append(image)
2394
- except ValueError:
2656
+ except ValueError as e:
2657
+ raise e
2395
2658
  continue
2396
2659
  self._images = images
2397
2660
  for image in self._images:
2398
2661
  image._bands = [band for band in image if band.band_id is not None]
2399
2662
 
2400
- if self.metadata is not None:
2401
- attributes_to_add = ["crs", "bounds"] + list(self.metadata_attributes)
2402
- for img in self:
2403
- for band in img:
2404
- for key in attributes_to_add:
2405
- try:
2406
- value = self.metadata[band.path][key]
2407
- except KeyError:
2408
- try:
2409
- value = self.metadata[key][band.path]
2410
- except KeyError:
2411
- continue
2412
- try:
2413
- setattr(band, key, value)
2414
- except Exception:
2415
- setattr(band, f"_{key}", value)
2416
-
2417
2663
  self._images = [img for img in self if len(img)]
2418
2664
 
2419
2665
  if self._should_be_sorted:
@@ -2438,7 +2684,7 @@ class ImageCollection(_ImageBase):
2438
2684
  and sort_group in _get_non_optional_groups(pat)
2439
2685
  for pat in self.image_patterns
2440
2686
  )
2441
- or all(img.date is not None for img in self)
2687
+ or all(getattr(img, sort_group) is not None for img in self)
2442
2688
  )
2443
2689
 
2444
2690
  @images.setter
@@ -2449,7 +2695,18 @@ class ImageCollection(_ImageBase):
2449
2695
 
2450
2696
  def __repr__(self) -> str:
2451
2697
  """String representation."""
2452
- return f"{self.__class__.__name__}({len(self)}, path='{self.path}')"
2698
+ root = ""
2699
+ if self.path is not None:
2700
+ data = f"'{self.path}'"
2701
+ elif all(img.path is not None for img in self):
2702
+ data = [img.path for img in self]
2703
+ parents = {str(Path(path).parent) for path in data}
2704
+ if len(parents) == 1:
2705
+ data = [Path(path).name for path in data]
2706
+ root = f" root='{next(iter(parents))}',"
2707
+ else:
2708
+ data = [img for img in self]
2709
+ return f"{self.__class__.__name__}({data},{root} res={self.res}, level='{self.level}')"
2453
2710
 
2454
2711
  def union_all(self) -> Polygon | MultiPolygon:
2455
2712
  """(Multi)Polygon representing the union of all image bounds."""
@@ -2500,12 +2757,8 @@ class ImageCollection(_ImageBase):
2500
2757
 
2501
2758
  alpha = 1 - p
2502
2759
 
2503
- # for img in self:
2504
- # for band in img:
2505
- # band.load()
2506
-
2507
2760
  for group_values, subcollection in self.groupby(by):
2508
- print("group_values:", *group_values)
2761
+ print("subcollection group values:", group_values)
2509
2762
 
2510
2763
  if "date" in x_var and subcollection._should_be_sorted:
2511
2764
  subcollection._images = list(sorted(subcollection._images))
@@ -2519,6 +2772,7 @@ class ImageCollection(_ImageBase):
2519
2772
  for band in img
2520
2773
  ]
2521
2774
  )
2775
+ first_date = pd.Timestamp(x[0])
2522
2776
  x = (
2523
2777
  pd.to_datetime(
2524
2778
  [band.date[:8] for img in subcollection for band in img]
@@ -2611,6 +2865,23 @@ class ImageCollection(_ImageBase):
2611
2865
  )
2612
2866
  plt.xlabel(x_var)
2613
2867
  plt.ylabel(y_label)
2868
+
2869
+ if x_var == "date":
2870
+ date_labels = pd.to_datetime(
2871
+ [first_date + pd.Timedelta(days=int(day)) for day in this_x]
2872
+ )
2873
+
2874
+ _, unique_indices = np.unique(
2875
+ date_labels.strftime("%Y-%m"), return_index=True
2876
+ )
2877
+
2878
+ unique_x = np.array(this_x)[unique_indices]
2879
+ unique_labels = date_labels[unique_indices].strftime("%Y-%m")
2880
+
2881
+ ax.set_xticks(unique_x)
2882
+ ax.set_xticklabels(unique_labels, rotation=45, ha="right")
2883
+ # ax.tick_params(axis="x", length=10, width=2)
2884
+
2614
2885
  plt.show()
2615
2886
 
2616
2887
 
@@ -2629,10 +2900,7 @@ class Sentinel2Config:
2629
2900
  """Holder of Sentinel 2 regexes, band_ids etc."""
2630
2901
 
2631
2902
  image_regexes: ClassVar[str] = (config.SENTINEL2_IMAGE_REGEX,)
2632
- filename_regexes: ClassVar[str] = (
2633
- config.SENTINEL2_FILENAME_REGEX,
2634
- config.SENTINEL2_CLOUD_FILENAME_REGEX,
2635
- )
2903
+ filename_regexes: ClassVar[str] = (config.SENTINEL2_FILENAME_REGEX,)
2636
2904
  metadata_attributes: ClassVar[
2637
2905
  dict[str, Callable | functools.partial | tuple[str]]
2638
2906
  ] = {
@@ -2640,22 +2908,69 @@ class Sentinel2Config:
2640
2908
  _extract_regex_match_from_string,
2641
2909
  regexes=(r"<PROCESSING_BASELINE>(.*?)</PROCESSING_BASELINE>",),
2642
2910
  ),
2643
- "cloud_coverage_percentage": "_get_cloud_coverage_percentage",
2644
- "is_refined": functools.partial(
2645
- _any_regex_matches, regexes=(r'<Image_Refining flag="REFINED">',)
2646
- ),
2647
- "boa_add_offset": "_get_boa_add_offset_dict",
2911
+ "cloud_cover_percentage": "_get_cloud_cover_percentage",
2912
+ "is_refined": "_get_image_refining_flag",
2913
+ "boa_quantification_value": "_get_boa_quantification_value",
2914
+ }
2915
+ l1c_bands: ClassVar[set[str]] = {
2916
+ "B01": 60,
2917
+ "B02": 10,
2918
+ "B03": 10,
2919
+ "B04": 10,
2920
+ "B05": 20,
2921
+ "B06": 20,
2922
+ "B07": 20,
2923
+ "B08": 10,
2924
+ "B8A": 20,
2925
+ "B09": 60,
2926
+ "B10": 60,
2927
+ "B11": 20,
2928
+ "B12": 20,
2648
2929
  }
2649
- all_bands: ClassVar[list[str]] = list(config.SENTINEL2_BANDS)
2650
- rbg_bands: ClassVar[list[str]] = config.SENTINEL2_RBG_BANDS
2651
- ndvi_bands: ClassVar[list[str]] = config.SENTINEL2_NDVI_BANDS
2652
- l2a_bands: ClassVar[dict[str, int]] = config.SENTINEL2_L2A_BANDS
2653
- l1c_bands: ClassVar[dict[str, int]] = config.SENTINEL2_L1C_BANDS
2930
+ l2a_bands: ClassVar[set[str]] = {
2931
+ key: res for key, res in l1c_bands.items() if key != "B10"
2932
+ }
2933
+ all_bands: ClassVar[set[str]] = l1c_bands
2934
+ rbg_bands: ClassVar[tuple[str]] = ("B04", "B02", "B03")
2935
+ ndvi_bands: ClassVar[tuple[str]] = ("B04", "B08")
2654
2936
  masking: ClassVar[BandMasking] = BandMasking(
2655
- band_id="SCL", values=(3, 8, 9, 10, 11)
2937
+ band_id="SCL",
2938
+ values={
2939
+ 2: "Topographic casted shadows",
2940
+ 3: "Cloud shadows",
2941
+ 8: "Cloud medium probability",
2942
+ 9: "Cloud high probability",
2943
+ 10: "Thin cirrus",
2944
+ 11: "Snow or ice",
2945
+ },
2656
2946
  )
2657
2947
 
2658
- def _get_cloud_coverage_percentage(self, xml_file: str) -> float:
2948
+ def _get_image_refining_flag(self, xml_file: str) -> bool:
2949
+ match_ = re.search(
2950
+ r'Image_Refining flag="(?:REFINED|NOT_REFINED)"',
2951
+ xml_file,
2952
+ )
2953
+ if match_ is None:
2954
+ raise _RegexError()
2955
+
2956
+ if "NOT_REFINED" in match_.group(0):
2957
+ return False
2958
+ elif "REFINED" in match_.group(0):
2959
+ return True
2960
+ else:
2961
+ raise _RegexError()
2962
+
2963
+ def _get_boa_quantification_value(self, xml_file: str) -> int:
2964
+ return int(
2965
+ _extract_regex_match_from_string(
2966
+ xml_file,
2967
+ (
2968
+ r'<BOA_QUANTIFICATION_VALUE unit="none">-?(\d+)</BOA_QUANTIFICATION_VALUE>',
2969
+ ),
2970
+ )
2971
+ )
2972
+
2973
+ def _get_cloud_cover_percentage(self, xml_file: str) -> float:
2659
2974
  return float(
2660
2975
  _extract_regex_match_from_string(
2661
2976
  xml_file,
@@ -2666,7 +2981,35 @@ class Sentinel2Config:
2666
2981
  )
2667
2982
  )
2668
2983
 
2669
- def _get_boa_add_offset_dict(self, xml_file: str) -> BandIdDict:
2984
+
2985
+ class Sentinel2CloudlessConfig(Sentinel2Config):
2986
+ """Holder of regexes, band_ids etc. for Sentinel 2 cloudless mosaic."""
2987
+
2988
+ image_regexes: ClassVar[str] = (config.SENTINEL2_MOSAIC_IMAGE_REGEX,)
2989
+ filename_regexes: ClassVar[str] = (config.SENTINEL2_MOSAIC_FILENAME_REGEX,)
2990
+ masking: ClassVar[None] = None
2991
+ all_bands: ClassVar[list[str]] = [
2992
+ x.replace("B0", "B") for x in Sentinel2Config.all_bands
2993
+ ]
2994
+ rbg_bands: ClassVar[dict[str, str]] = {
2995
+ key.replace("B0", "B") for key in Sentinel2Config.rbg_bands
2996
+ }
2997
+ ndvi_bands: ClassVar[dict[str, str]] = {
2998
+ key.replace("B0", "B") for key in Sentinel2Config.ndvi_bands
2999
+ }
3000
+
3001
+
3002
+ class Sentinel2Band(Sentinel2Config, Band):
3003
+ """Band with Sentinel2 specific name variables and regexes."""
3004
+
3005
+ metadata_attributes = Sentinel2Config.metadata_attributes | {
3006
+ "boa_add_offset": "_get_boa_add_offset_dict",
3007
+ }
3008
+
3009
+ def _get_boa_add_offset_dict(self, xml_file: str) -> int | None:
3010
+ if self.is_mask:
3011
+ return None
3012
+
2670
3013
  pat = re.compile(
2671
3014
  r"""
2672
3015
  <BOA_ADD_OFFSET\s*
@@ -2683,30 +3026,39 @@ class Sentinel2Config:
2683
3026
  raise _RegexError(f"Could not find boa_add_offset info from {pat}") from e
2684
3027
  if not matches:
2685
3028
  raise _RegexError(f"Could not find boa_add_offset info from {pat}")
2686
- return BandIdDict(
3029
+
3030
+ dict_ = (
2687
3031
  pd.DataFrame(matches).set_index("band_id")["value"].astype(int).to_dict()
2688
3032
  )
2689
3033
 
3034
+ # some xml files have band ids in range index form
3035
+ # converting these to actual band ids (B01 etc.)
3036
+ is_integer_coded = [int(i) for i in dict_] == list(range(len(dict_)))
2690
3037
 
2691
- class Sentinel2CloudlessConfig(Sentinel2Config):
2692
- """Holder of regexes, band_ids etc. for Sentinel 2 cloudless mosaic."""
2693
-
2694
- image_regexes: ClassVar[str] = (config.SENTINEL2_MOSAIC_IMAGE_REGEX,)
2695
- filename_regexes: ClassVar[str] = (config.SENTINEL2_MOSAIC_FILENAME_REGEX,)
2696
- masking: ClassVar[None] = None
2697
- all_bands: ClassVar[list[str]] = [
2698
- x.replace("B0", "B") for x in Sentinel2Config.all_bands
2699
- ]
2700
- rbg_bands: ClassVar[list[str]] = [
2701
- x.replace("B0", "B") for x in Sentinel2Config.rbg_bands
2702
- ]
2703
- ndvi_bands: ClassVar[list[str]] = [
2704
- x.replace("B0", "B") for x in Sentinel2Config.ndvi_bands
2705
- ]
2706
-
3038
+ if is_integer_coded:
3039
+ # the xml files contain 13 bandIds for both L1C and L2A
3040
+ # even though L2A doesn't have band B10
3041
+ all_bands = list(self.l1c_bands)
3042
+ if len(all_bands) != len(dict_):
3043
+ raise ValueError(
3044
+ f"Different number of bands in xml file and config for {self.name}: {all_bands}, {list(dict_)}"
3045
+ )
3046
+ dict_ = {
3047
+ band_id: value
3048
+ for band_id, value in zip(all_bands, dict_.values(), strict=True)
3049
+ }
2707
3050
 
2708
- class Sentinel2Band(Sentinel2Config, Band):
2709
- """Band with Sentinel2 specific name variables and regexes."""
3051
+ try:
3052
+ return dict_[self.band_id]
3053
+ except KeyError as e:
3054
+ band_id = self.band_id.upper()
3055
+ for txt in ["B0", "B", "A"]:
3056
+ band_id = band_id.replace(txt, "")
3057
+ try:
3058
+ return dict_[band_id]
3059
+ except KeyError:
3060
+ continue
3061
+ raise KeyError(self.band_id, dict_) from e
2710
3062
 
2711
3063
 
2712
3064
  class Sentinel2Image(Sentinel2Config, Image):
@@ -2716,12 +3068,15 @@ class Sentinel2Image(Sentinel2Config, Image):
2716
3068
 
2717
3069
  def ndvi(
2718
3070
  self,
2719
- red_band: str = Sentinel2Config.ndvi_bands[0],
2720
- nir_band: str = Sentinel2Config.ndvi_bands[1],
3071
+ red_band: str = "B04",
3072
+ nir_band: str = "B08",
3073
+ padding: int = 0,
2721
3074
  copy: bool = True,
2722
3075
  ) -> NDVIBand:
2723
3076
  """Calculate the NDVI for the Image."""
2724
- return super().ndvi(red_band=red_band, nir_band=nir_band, copy=copy)
3077
+ return super().ndvi(
3078
+ red_band=red_band, nir_band=nir_band, padding=padding, copy=copy
3079
+ )
2725
3080
 
2726
3081
 
2727
3082
  class Sentinel2Collection(Sentinel2Config, ImageCollection):
@@ -2732,8 +3087,8 @@ class Sentinel2Collection(Sentinel2Config, ImageCollection):
2732
3087
 
2733
3088
  def __init__(self, data: str | Path | Sequence[Image], **kwargs) -> None:
2734
3089
  """ImageCollection with Sentinel2 specific name variables and path regexes."""
2735
- level = kwargs.get("level", NoLevel)
2736
- if isinstance(level, type) and isinstance(level(), NoLevel):
3090
+ level = kwargs.get("level", None_)
3091
+ if callable(level) and isinstance(level(), None_):
2737
3092
  raise ValueError("Must specify level for Sentinel2Collection.")
2738
3093
  super().__init__(data=data, **kwargs)
2739
3094
 
@@ -2797,29 +3152,6 @@ def _get_gradient(band: Band, degrees: bool = False, copy: bool = True) -> Band:
2797
3152
  raise ValueError("array must be 2 or 3 dimensional")
2798
3153
 
2799
3154
 
2800
- def to_xarray(
2801
- array: np.ndarray, transform: Affine, crs: Any, name: str | None = None
2802
- ) -> DataArray:
2803
- """Convert the raster to an xarray.DataArray."""
2804
- if len(array.shape) == 2:
2805
- height, width = array.shape
2806
- dims = ["y", "x"]
2807
- elif len(array.shape) == 3:
2808
- height, width = array.shape[1:]
2809
- dims = ["band", "y", "x"]
2810
- else:
2811
- raise ValueError(f"Array should be 2 or 3 dimensional. Got shape {array.shape}")
2812
-
2813
- coords = _generate_spatial_coords(transform, width, height)
2814
- return xr.DataArray(
2815
- array,
2816
- coords=coords,
2817
- dims=dims,
2818
- name=name,
2819
- attrs={"crs": crs},
2820
- )
2821
-
2822
-
2823
3155
  def _slope_2d(array: np.ndarray, res: int, degrees: int) -> np.ndarray:
2824
3156
  gradient_x, gradient_y = np.gradient(array, res, res)
2825
3157
 
@@ -2836,47 +3168,31 @@ def _slope_2d(array: np.ndarray, res: int, degrees: int) -> np.ndarray:
2836
3168
  return degrees
2837
3169
 
2838
3170
 
2839
- def _clip_loaded_array(
2840
- arr: np.ndarray,
2841
- bounds: tuple[int, int, int, int],
2842
- transform: Affine,
3171
+ def _clip_xarray(
3172
+ xarr: DataArray,
3173
+ mask: tuple[int, int, int, int],
2843
3174
  crs: Any,
2844
- out_shape: tuple[int, int],
2845
3175
  **kwargs,
2846
- ) -> np.ndarray:
3176
+ ) -> DataArray:
2847
3177
  # xarray needs a numpy array of polygons
2848
- bounds_arr: np.ndarray = GeoSeries([to_shapely(bounds)]).values
3178
+ mask_arr: np.ndarray = to_geoseries(mask).values
2849
3179
  try:
2850
-
2851
- while out_shape != arr.shape:
2852
- arr = (
2853
- to_xarray(
2854
- arr,
2855
- transform=transform,
2856
- crs=crs,
2857
- )
2858
- .rio.clip(bounds_arr, crs=crs, **kwargs)
2859
- .to_numpy()
2860
- )
2861
- # bounds_arr = bounds_arr.buffer(0.0000001)
2862
- return arr
2863
-
3180
+ return xarr.rio.clip(
3181
+ mask_arr,
3182
+ crs=crs,
3183
+ **kwargs,
3184
+ )
2864
3185
  except NoDataInBounds:
2865
3186
  return np.array([])
2866
3187
 
2867
3188
 
2868
- def _fix_path(path: str) -> str:
2869
- return (
2870
- str(path).replace("\\", "/").replace(r"\"", "/").replace("//", "/").rstrip("/")
2871
- )
2872
-
2873
-
2874
- def _get_all_file_paths(path: str) -> list[str]:
3189
+ def _get_all_file_paths(path: str) -> set[str]:
2875
3190
  if is_dapla():
2876
- return list(sorted(set(_glob_func(path + "/**"))))
3191
+ return {_fix_path(x) for x in sorted(set(_glob_func(path + "/**")))}
2877
3192
  else:
2878
- return list(
2879
- sorted(
3193
+ return {
3194
+ _fix_path(x)
3195
+ for x in sorted(
2880
3196
  set(
2881
3197
  _glob_func(path + "/**")
2882
3198
  + _glob_func(path + "/**/**")
@@ -2885,7 +3201,7 @@ def _get_all_file_paths(path: str) -> list[str]:
2885
3201
  + _glob_func(path + "/**/**/**/**/**")
2886
3202
  )
2887
3203
  )
2888
- )
3204
+ }
2889
3205
 
2890
3206
 
2891
3207
  def _get_images(
@@ -2900,9 +3216,8 @@ def _get_images(
2900
3216
  masking: BandMasking | None,
2901
3217
  **kwargs,
2902
3218
  ) -> list[Image]:
2903
-
2904
- with joblib.Parallel(n_jobs=processes, backend="loky") as parallel:
2905
- images = parallel(
3219
+ with joblib.Parallel(n_jobs=processes, backend="threading") as parallel:
3220
+ images: list[Image] = parallel(
2906
3221
  joblib.delayed(image_class)(
2907
3222
  path,
2908
3223
  df=df,
@@ -3017,13 +3332,13 @@ def _copy_and_add_df_parallel(
3017
3332
  return (i, copied)
3018
3333
 
3019
3334
 
3020
- def _get_bounds(bounds, bbox) -> None | Polygon:
3335
+ def _get_bounds(bounds, bbox, band_bounds: Polygon) -> None | Polygon:
3021
3336
  if bounds is None and bbox is None:
3022
3337
  return None
3023
3338
  elif bounds is not None and bbox is None:
3024
- return to_shapely(bounds) # .intersection(self.union_all())
3339
+ return to_shapely(bounds).intersection(band_bounds)
3025
3340
  elif bounds is None and bbox is not None:
3026
- return to_shapely(bbox) # .intersection(self.union_all())
3341
+ return to_shapely(bbox).intersection(band_bounds)
3027
3342
  else:
3028
3343
  return to_shapely(bounds).intersection(to_shapely(bbox))
3029
3344
 
@@ -3041,7 +3356,15 @@ def _open_raster(path: str | Path) -> rasterio.io.DatasetReader:
3041
3356
 
3042
3357
 
3043
3358
  def _load_band(band: Band, **kwargs) -> None:
3044
- band.load(**kwargs)
3359
+ return band.load(**kwargs)
3360
+
3361
+
3362
+ def _band_apply(band: Band, func: Callable, **kwargs) -> None:
3363
+ return band.apply(func, **kwargs)
3364
+
3365
+
3366
+ def _clip_band(band: Band, mask, **kwargs) -> None:
3367
+ return band.clip(mask, **kwargs)
3045
3368
 
3046
3369
 
3047
3370
  def _merge_by_band(collection: ImageCollection, **kwargs) -> Image:
@@ -3053,7 +3376,7 @@ def _merge(collection: ImageCollection, **kwargs) -> Band:
3053
3376
 
3054
3377
 
3055
3378
  def _zonal_one_pair(i: int, poly: Polygon, band: Band, aggfunc, array_func, func_names):
3056
- clipped = band.copy().load(bounds=poly)
3379
+ clipped = band.copy().clip(poly)
3057
3380
  if not np.size(clipped.values):
3058
3381
  return _no_overlap_df(func_names, i, date=band.date)
3059
3382
  return _aggregate(clipped.values, array_func, aggfunc, func_names, band.date, i)