ssb-sgis 1.0.5__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sgis/raster/cube.py DELETED
@@ -1,1274 +0,0 @@
1
- import functools
2
- import itertools
3
- import multiprocessing
4
- import re
5
- import warnings
6
- from collections.abc import Callable
7
- from collections.abc import Iterable
8
- from collections.abc import Iterator
9
- from copy import copy
10
- from copy import deepcopy
11
- from pathlib import Path
12
- from typing import Any
13
- from typing import ClassVar
14
-
15
- import geopandas as gpd
16
- import numpy as np
17
- import pandas as pd
18
- import pyproj
19
- import rasterio
20
- import shapely
21
- from geopandas import GeoDataFrame
22
- from geopandas import GeoSeries
23
- from pandas import DataFrame
24
- from pandas import Series
25
- from pandas.api.types import is_dict_like
26
- from pandas.api.types import is_list_like
27
- from rasterio import merge as rasterio_merge
28
-
29
- try:
30
- import xarray as xr
31
- from xarray import Dataset
32
- except ImportError:
33
-
34
- class Dataset:
35
- """Placeholder."""
36
-
37
-
38
- from rtree.index import Index
39
- from rtree.index import Property
40
- from shapely import Geometry
41
- from typing_extensions import Self # TODO: imperter fra typing når python 3.11
42
-
43
- from ..geopandas_tools.bounds import make_grid
44
- from ..geopandas_tools.conversion import is_bbox_like
45
- from ..geopandas_tools.conversion import to_bbox
46
- from ..geopandas_tools.conversion import to_shapely
47
- from ..geopandas_tools.general import get_common_crs
48
- from ..geopandas_tools.overlay import clean_overlay
49
- from ..helpers import get_all_files
50
- from ..helpers import get_numpy_func
51
- from ..io._is_dapla import is_dapla
52
- from ..io.opener import opener
53
- from ..parallel.parallel import Parallel
54
- from .raster import Raster
55
-
56
- try:
57
- from torchgeo.datasets.geo import RasterDataset
58
- from torchgeo.datasets.utils import BoundingBox
59
- except ImportError:
60
-
61
- class BoundingBox:
62
- """Placeholder."""
63
-
64
- def __init__(self, *args, **kwargs) -> None:
65
- """Placeholder."""
66
- raise ImportError("missing optional dependency 'torchgeo'")
67
-
68
- class RasterDataset:
69
- """Placeholder."""
70
-
71
- def __init__(self, *args, **kwargs) -> None:
72
- """Placeholder."""
73
- raise ImportError("missing optional dependency 'torchgeo'")
74
-
75
-
76
- try:
77
- import torch
78
- from torchgeo.datasets.utils import disambiguate_timestamp
79
- except ImportError:
80
-
81
- class torch:
82
- """Placeholder."""
83
-
84
- class Tensor:
85
- """Placeholder to reference torch.Tensor."""
86
-
87
-
88
- try:
89
- from ..io.dapla_functions import read_geopandas
90
- except ImportError:
91
- pass
92
-
93
- try:
94
- from dapla import FileClient
95
- from dapla import write_pandas
96
- except ImportError:
97
- pass
98
-
99
- from .base import ALLOWED_KEYS
100
- from .base import NESSECARY_META
101
- from .base import get_index_mapper
102
- from .cubebase import _from_gdf_func
103
- from .cubebase import _method_as_func
104
- from .cubebase import _raster_from_path
105
- from .cubebase import _write_func
106
- from .indices import get_raster_pairs
107
- from .indices import index_calc_pair
108
- from .zonal import _make_geometry_iterrows
109
- from .zonal import _prepare_zonal
110
- from .zonal import _zonal_func
111
- from .zonal import _zonal_post
112
-
113
- TORCHGEO_RETURN_TYPE = dict[str, torch.Tensor | pyproj.CRS | BoundingBox]
114
-
115
-
116
- class DataCube:
117
- """Experimental."""
118
-
119
- CUBE_DF_NAME: ClassVar[str] = "cube_df.parquet"
120
-
121
- separate_files: ClassVar[bool] = True
122
- is_image: ClassVar[bool] = True
123
- date_format: ClassVar[str | None] = None
124
-
125
def __init__(
    self,
    data: Iterable[Raster] | None = None,
    crs: Any | None = None,
    res: int | None = None,
    nodata: int | None = None,
    copy: bool = False,
    parallelizer: Parallel | None = None,
) -> None:
    """Initialize a DataCube instance with optional Raster data.

    Args:
        data: Iterable of Raster objects or a single DataCube to copy data from.
        crs: Coordinate reference system to be applied to the images.
        res: Spatial resolution of the images, applied uniformly to all Rasters.
        nodata: Nodata value to unify across all Rasters within the cube.
        copy: If True, makes copies of the Rasters provided.
        parallelizer: sgis.Parallel instance to handle concurrent operations.

    Raises:
        TypeError: If 'data' is not list-like. The element types are
            validated when the rasters are assigned to the 'data' property.
        ValueError: If the rasters have differing nodata values or
            resolutions and the corresponding argument is not given.
    """
    # stacklevel=2 attributes the warning to the calling code rather than
    # to this line inside the library.
    warnings.warn(
        "This class is deprecated in favor of ImageCollection", stacklevel=2
    )

    self._arrays = None
    self._res = res
    self.parallelizer = parallelizer

    # hasattr check to allow class attribute
    if not hasattr(self, "_nodata"):
        self._nodata = nodata

    if isinstance(data, DataCube):
        # Take over every instance attribute of the other cube.
        for key, value in data.__dict__.items():
            setattr(self, key, value)
        return

    if data is None:
        self.data = data
        self._crs = None
        return

    # Reject non-list-like input up front. The previous check combined the
    # two conditions with 'and', so it tried to iterate the very objects it
    # was meant to reject.
    if not is_list_like(data):
        raise TypeError(
            "'data' must be a Raster instance or an iterable."
            f"Got {type(data)}: {data}"
        )

    data = list(data)

    # Copy when asked to, or when the same Raster object occurs more than
    # once, so that every element of the cube is an independent object.
    if copy or len({id(raster) for raster in data}) < len(data):
        data = [raster.copy() for raster in data]

    self.data = data

    nodatas = {r.nodata for r in self}
    if self.nodata is None and len(nodatas) > 1:
        raise ValueError(
            "Must specify 'nodata' when the images have different nodata values. "
            f"Got {', '.join([str(x) for x in nodatas])}"
        )

    resolutions = {r.res for r in self}
    if self._res is None and len(resolutions) > 1:
        raise ValueError(
            "Must specify 'res' when the images have different resolutions. "
            f"Got {', '.join([str(x) for x in resolutions])}"
        )
    elif res is None and len(resolutions):
        self._res = resolutions.pop()

    if crs:
        self._crs = pyproj.CRS(crs)
        if not all(self._crs.equals(pyproj.CRS(r.crs)) for r in self.data):
            # Reproject in place. Assigning the result to 'self' only
            # rebinds a local name and leaves this instance unchanged.
            self.to_crs(self._crs, copy=False)

    try:
        self._crs = get_common_crs(self.data)
    except (ValueError, IndexError):
        self._crs = None
205
-
206
@classmethod
def from_root(
    cls,
    root: str | Path,
    *,
    res: int | None = None,
    check_for_df: bool = True,
    contains: str | None = None,
    endswith: str = ".tif",
    bands: str | list[str] | None = None,
    filename_regex: str | None = None,
    parallelizer: Parallel | None = None,
    file_system=None,
    **kwargs,
) -> "DataCube":
    """Build a DataCube from the files found below a root directory.

    Folders containing a file named like 'CUBE_DF_NAME' are read through
    'from_cube_df'; all other files are read through 'from_paths'.

    Args:
        root: Root directory path to search for raster image files.
        res: Resolution to unify the data within the cube.
        check_for_df: Whether to look for metadata parquet files
            among the files found.
        contains: Keep only files whose path contains this substring.
        endswith: Keep only files whose path ends with this substring.
        bands: One or more band ids to keep.
        filename_regex: Regular expression to match file names.
        parallelizer: sgis.Parallel instance forwarded to 'from_paths'.
        file_system: File system used for globbing when running on Dapla.
        **kwargs: Additional keyword arguments passed to 'from_paths'.

    Returns:
        A DataCube with the rasters found.
    """
    kwargs.update(
        res=res,
        filename_regex=filename_regex,
        contains=contains,
        bands=bands,
        endswith=endswith,
    )

    if not is_dapla():
        paths = get_all_files(root)
    else:
        if file_system is None:
            file_system = FileClient.get_gcs_file_system()
        paths: list[str] = file_system.glob(str(Path(root) / "**"))
        if contains:
            paths = [candidate for candidate in paths if contains in candidate]

    df_paths = [
        candidate for candidate in paths if candidate.endswith(cls.CUBE_DF_NAME)
    ]

    # Without metadata files (or when told to ignore them), read every file.
    if not (check_for_df and len(df_paths)):
        return cls.from_paths(paths, parallelizer=parallelizer, **kwargs)

    df_folders: set[Path] = {
        Path(df_path).parent for df_path in df_paths if df_path
    }
    cubes: list[DataCube] = [
        cls.from_cube_df(df_path, res=res) for df_path in df_paths
    ]

    loose_paths = [
        candidate for candidate in paths if Path(candidate).parent not in df_folders
    ]
    if loose_paths:
        cubes.append(
            cls.from_paths(loose_paths, parallelizer=parallelizer, **kwargs)
        )

    return concat_cubes(cubes, res=res)
284
-
285
@classmethod
def from_paths(
    cls,
    paths: Iterable[str | Path],
    *,
    res: int | None = None,
    parallelizer: Parallel | None = None,
    file_system=None,
    contains: str | None = None,
    bands: str | list[str] | None = None,
    endswith: str = ".tif",
    filename_regex: str | None = None,
    **kwargs,
) -> "DataCube":
    """Create a DataCube from a list of file paths.

    Args:
        paths: Iterable of file paths (str or Path) to raster files.
        res: Resolution to unify the data within the cube.
        parallelizer: Parallel instance used to read the files concurrently.
        file_system: File system forwarded to the raster loading function.
            Created automatically when running on Dapla and not given.
        contains: Keep only paths containing this substring.
        endswith: Keep only paths ending with this substring.
        bands: One or more band ids; a path is kept if it contains any of them.
        filename_regex: Verbose regular expression that file names must match.
        **kwargs: Additional keyword arguments to pass to the raster loading
            function. A 'crs' entry is removed and given to the cube instead.

    Returns:
        An instance of DataCube containing the raster data from specified paths.
    """
    crs = kwargs.pop("crs", None)

    # Materialize once so that generators survive the filtering steps and
    # the emptiness check below is reliable.
    paths = list(paths)

    # Match on the string form so that pathlib.Path entries work as well;
    # the original objects are kept in the resulting list.
    if contains:
        paths = [path for path in paths if contains in str(path)]
    if endswith:
        paths = [path for path in paths if str(path).endswith(endswith)]
    if filename_regex:
        compiled = re.compile(filename_regex, re.VERBOSE)
        paths = [path for path in paths if compiled.search(Path(path).name)]
    if bands:
        if isinstance(bands, str):
            bands = [bands]
        paths = [path for path in paths if any(band in str(path) for band in bands)]

    if not paths:
        return cls(crs=crs, parallelizer=parallelizer, res=res)

    kwargs["res"] = res
    kwargs["filename_regex"] = filename_regex

    if file_system is None and is_dapla():
        file_system = FileClient.get_gcs_file_system()
    if file_system is not None:
        # Also forward a file system supplied by the caller; previously only
        # the automatically created one reached the loading function.
        kwargs["file_system"] = file_system

    if parallelizer is None:
        rasters: list[Raster] = [
            _raster_from_path(path, **kwargs) for path in paths
        ]
    else:
        rasters: list[Raster] = parallelizer.map(
            _raster_from_path,
            paths,
            kwargs=kwargs,
        )

    return cls(rasters, copy=False, crs=crs, res=res)
350
-
351
@classmethod
def from_gdf(
    cls,
    gdf: GeoDataFrame | Iterable[GeoDataFrame],
    columns: str | Iterable[str],
    res: int,
    parallelizer: Parallel | None = None,
    tile_size: int | None = None,
    grid: GeoSeries | None = None,
    **kwargs,
) -> "DataCube":
    """Create a DataCube from one or more GeoDataFrames, split into tiles.

    Each frame is overlaid with the grid and rasterized one tile at a time.

    Args:
        gdf: GeoDataFrame or an iterable of GeoDataFrames to rasterize.
        columns: The column(s) in the GeoDataFrame that will be used as values for the rasterization.
        res: Spatial resolution of the output rasters.
        parallelizer: Parallel instance; its 'processes' attribute decides
            the number of worker processes. None means serial processing.
        tile_size: Size of each tile/grid cell, used when 'grid' is not given.
        grid: Predefined grid to split the data by. A 'tile_idx' column
            is assigned to it.
        **kwargs: Additional keyword arguments passed on to the rasterization function.

    Returns:
        An instance of DataCube containing rasterized data from the GeoDataFrame(s).

    Raises:
        ValueError: If neither 'tile_size' nor 'grid' is given, or if the
            parallelizer has fewer than one process.
        TypeError: If 'gdf' contains anything but GeoDataFrames.
    """
    if grid is None and tile_size is None:
        raise ValueError("Must specify either 'tile_size' or 'grid'.")

    if isinstance(gdf, GeoDataFrame):
        gdf = [gdf]
    else:
        # Materialize so that one-shot iterables are not exhausted by the
        # type check below.
        gdf = list(gdf)
        if not all(isinstance(frame, GeoDataFrame) for frame in gdf):
            raise TypeError("gdf must be one or more GeoDataFrames.")

    # 'parallelizer' defaults to None, which means serial processing.
    processes = 1 if parallelizer is None else parallelizer.processes
    if processes < 1:
        raise ValueError("processes must be an integer 1 or greater.")

    if grid is None:
        crs = get_common_crs(gdf)
        total_bounds = shapely.union_all(
            [shapely.box(*frame.total_bounds) for frame in gdf]
        )
        grid = make_grid(total_bounds, gridsize=tile_size, crs=crs)

    grid["tile_idx"] = range(len(grid))

    rasterize = functools.partial(
        _from_gdf_func,
        columns=columns,
        res=res,
        **kwargs,
    )

    def split_by_tile(frame: GeoDataFrame) -> list[GeoDataFrame]:
        """Return one GeoDataFrame per tile index present in the frame."""
        return [
            frame.loc[frame["tile_idx"] == i] for i in frame["tile_idx"].unique()
        ]

    # Only tiles are collected here. The previous parallel branch also put
    # the overlaid GeoDataFrames themselves into the raster list, which the
    # cube rejects.
    tiles: list[GeoDataFrame] = []
    for frame in gdf:
        tiles += split_by_tile(frame.overlay(grid, keep_geom_type=True))

    if processes > 1:
        with multiprocessing.get_context("spawn").Pool(processes) as pool:
            rasters = pool.map(rasterize, tiles)
    else:
        rasters = [rasterize(tile) for tile in tiles]

    return cls(rasters, res=res)
423
-
424
@classmethod
def from_cube_df(
    cls, df: DataFrame | str | Path, res: int | None = None
) -> "DataCube":
    """Construct a DataCube from a DataFrame or path containing metadata or paths of rasters.

    Args:
        df: DataFrame, dict, path to a parquet file, or a list-like of
            these. A list-like gives one cube per element, which are
            then concatenated.
        res: Optional resolution given to the resulting cube.

    Returns:
        A DataCube instance containing the raster data described by the DataFrame.

    Raises:
        TypeError: If 'df' is none of the supported types.
    """
    if isinstance(df, (str, Path)):
        df = read_geopandas(df) if is_dapla() else gpd.read_parquet(df)

    # recursive
    # The list-like guard keeps non-iterable input from failing inside all().
    if (
        is_list_like(df)
        and not is_dict_like(df)
        and all(isinstance(x, (str, Path, DataFrame)) for x in df)
    ):
        cubes = [cls.from_cube_df(x) for x in df]
        return concat_cubes(cubes, res=res)

    if isinstance(df, dict):
        df = DataFrame(df)
    elif not isinstance(df, DataFrame):
        raise TypeError("df must be DataFrame or file path to a parquet file.")

    rasters: list[Raster] = [
        Raster.from_dict(meta) for _, meta in (df[NESSECARY_META].iterrows())
    ]
    # Forward 'res'; it was previously accepted but dropped on this path.
    return cls(rasters, res=res)
457
-
458
def to_gdf(
    self, column: str | None = None, ignore_index: bool = False, concat: bool = True
) -> GeoDataFrame:
    """Run 'to_gdf' on every raster.

    Returns the frames concatenated into one when 'concat' is True,
    otherwise the list of per-raster results.
    """
    frames = self.run_raster_method("to_gdf", column=column, return_self=False)
    if not concat:
        return frames
    return pd.concat(frames, ignore_index=ignore_index)
467
-
468
def to_xarray(self) -> Dataset:
    """Build an xarray.Dataset keyed by each raster's position in the cube."""
    data_vars = {}
    for position, raster in enumerate(self.data):
        data_vars[position] = raster.to_xarray()
    return xr.Dataset(data_vars)
471
-
472
def zonal(
    self,
    polygons: GeoDataFrame,
    aggfunc: str | Callable | list[Callable | str],
    array_func: Callable | None = None,
    by_date: bool | None = None,
    dropna: bool = True,
) -> GeoDataFrame:
    """Calculate zonal statistics within polygons.

    When 'by_date' is None, it is set to True only if every raster
    in the cube has a date.
    """
    idx_mapper, idx_name = get_index_mapper(polygons)
    polygons, aggfunc, func_names = _prepare_zonal(polygons, aggfunc)
    poly_iter = _make_geometry_iterrows(polygons)

    if by_date is None:
        by_date: bool = all(r.date is not None for r in self)

    # The same keyword arguments are used for the serial and parallel paths.
    shared_kwargs = dict(
        cube=self,
        array_func=array_func,
        aggfunc=aggfunc,
        func_names=func_names,
        by_date=by_date,
    )

    if self.parallelizer:
        aggregated: list[DataFrame] = self.parallelizer.map(
            _zonal_func, poly_iter, kwargs=shared_kwargs
        )
    else:
        aggregated: list[DataFrame] = [
            _zonal_func(poly, **shared_kwargs) for poly in poly_iter
        ]

    return _zonal_post(
        aggregated,
        polygons=polygons,
        idx_mapper=idx_mapper,
        idx_name=idx_name,
        dropna=dropna,
    )
520
-
521
def gradient(self, degrees: bool = False) -> Self:
    """Replace the cube's rasters with the result of their 'gradient' method."""
    gradients = self.run_raster_method("gradient", degrees=degrees)
    self.data = gradients
    return self
525
-
526
def map(self, func: Callable, return_self: bool = True, **kwargs) -> Self:
    """Apply a function to the array of each raster.

    The function must take a numpy array as first positional argument,
    and return a single numpy array. The function should be defined in
    the leftmost indentation level. If in Jupyter, the function also
    has to be defined in and imported from another file.

    Returns the cube with updated arrays, or the list of results
    when 'return_self' is False.
    """
    self._check_for_array()

    if not self.parallelizer:
        results = [func(array, **kwargs) for array in self.arrays]
    else:
        results = self.parallelizer.map(func, self.arrays, kwargs=kwargs)

    if return_self:
        self.arrays = results
        return self
    return results
543
-
544
def raster_map(self, func: Callable, return_self: bool = True, **kwargs) -> Self:
    """Apply a function to each raster.

    The function must take a Raster object as first positional argument,
    and return a single Raster object. The function should be defined in
    the leftmost indentation level. If in Jupyter, the function also
    has to be defined in and imported from another file.

    Returns the cube with the results as its data, or the list of
    results when 'return_self' is False.
    """
    if not self.parallelizer:
        results = [func(raster, **kwargs) for raster in self]
    else:
        results = self.parallelizer.map(func, self, kwargs=kwargs)

    if return_self:
        self.data = results
        return self
    return results
560
-
561
def sample(self, n: int, copy: bool = True, **kwargs) -> Self:
    """Sample n rasters from the cube and run their 'load' method.

    Keyword arguments are passed on to the 'load' method of the rasters.
    """
    if self.crs is None:
        self._crs = get_common_crs(self.data)

    target = self if not copy else self.copy()

    picked = pd.Series(target.data).sample(n)
    target.data = list(picked)
    target.data = target.run_raster_method("load", **kwargs)
    return target
573
-
574
def load(self, copy: bool = True, **kwargs) -> Self:
    """Run the 'load' method of every raster, by default on a copy of the cube."""
    if self.crs is None:
        self._crs = get_common_crs(self.data)

    target = self if not copy else self.copy()
    loaded = target.run_raster_method("load", **kwargs)
    target.data = loaded
    return target
584
-
585
def intersection(self, other: Any, copy: bool = True) -> Self:
    """Select the rasters whose bounding box intersects 'other'."""
    geometry = to_shapely(other)
    target = self if not copy else self.copy()
    intersecting = target.boxes.intersects(geometry)
    return target[intersecting]
591
-
592
def sfilter(
    self, other: GeoDataFrame | GeoSeries | Geometry | tuple, copy: bool = True
) -> Self:
    """Spatially filter images by bounding box or geometry object.

    Args:
        other: Geometry-like object, converted with 'to_shapely'.
        copy: If True, the filtering is done on a copy of the cube.

    Returns:
        The cube (or a copy) holding only the rasters that intersect 'other'.
    """
    other = to_shapely(other)
    cube = self.copy() if copy else self
    # Filter the rasters of 'cube', not of 'self'. Otherwise a copied cube
    # would end up holding the original Raster objects.
    cube.data = [raster for raster in cube if raster.union_all().intersects(other)]
    return cube
600
-
601
def clip(
    self, mask: GeoDataFrame | GeoSeries | Geometry, copy: bool = True, **kwargs
) -> Self:
    """Keep the rasters intersecting the mask and clip each of them by it.

    Raises:
        ValueError: If the mask has a crs that differs from the cube's.
    """
    if self.crs is None:
        self._crs = get_common_crs(self.data)

    mask_crs = getattr(mask, "crs", None)
    if mask_crs and not pyproj.CRS(self.crs).equals(pyproj.CRS(mask_crs)):
        raise ValueError("crs mismatch.")

    target = self if not copy else self.copy()
    # Drop the rasters outside the mask before clipping the rest.
    target = target.sfilter(to_shapely(mask), copy=False)
    clipped = target.run_raster_method("clip", mask=mask, **kwargs)
    target.data = clipped
    return target
621
-
622
def clipmerge(self, mask: GeoDataFrame | GeoSeries | Geometry, **kwargs) -> Self:
    """Clip the images and merge to one image (delegates to '_clipmerge')."""
    result = _clipmerge(self, mask, **kwargs)
    return result
625
-
626
def merge_by_bounds(self, by: str | list[str] | None = None, **kwargs) -> Self:
    """Merge images with the same bounding box (delegates to '_merge_by_bounds')."""
    result = _merge_by_bounds(self, by=by, **kwargs)
    return result
629
-
630
def merge(self, by: str | list[str] | None = None, **kwargs) -> Self:
    """Merge all images to one (delegates to '_merge')."""
    result = _merge(self, by=by, **kwargs)
    return result
633
-
634
def explode(self) -> Self:
    """Split every raster into one raster per band.

    Rasters with loaded values are split per array; rasters without values
    are split per band index.
    """
    excluded_keys = ["array", "indexes"]

    def split_raster(raster: Raster) -> list[Raster]:
        """Return one single-band raster per band of the given raster."""
        properties = {key: getattr(raster, key) for key in raster.properties}
        combined = raster.__dict__ | raster.meta | properties
        shared_meta = {
            key: value
            for key, value in combined.items()
            if key in ALLOWED_KEYS and key not in excluded_keys
        }
        raster_type = raster.__class__

        if raster.values is not None:
            # Band indexes are 1-based.
            return [
                raster_type.from_dict(
                    {"array": band_array, "indexes": band_number} | shared_meta
                )
                for band_number, band_array in enumerate(
                    raster.array_list(), start=1
                )
            ]
        return [
            raster_type.from_dict({"indexes": band_number} | shared_meta)
            for band_number in raster.indexes_as_tuple()
        ]

    exploded: list[Raster] = []
    for raster in self:
        exploded.extend(split_raster(raster))
    self.data = exploded
    return self
669
-
670
def dissolve_bands(self, aggfunc: Callable | str, copy: bool = True) -> Self:
    """Aggregate each raster's array along axis 0 with a single function.

    Raises:
        TypeError: If 'aggfunc' is neither a callable nor a string.
    """
    self._check_for_array()
    if not (callable(aggfunc) or isinstance(aggfunc, str)):
        raise TypeError("Can only supply a single aggfunc")

    target = self if not copy else self.copy()
    numpy_func = get_numpy_func(aggfunc)
    return target.map(numpy_func, axis=0)
682
-
683
def write(
    self,
    root: str,
    file_format: str = "tif",
    **kwargs,
) -> None:
    """Write each raster to a file named after the raster.

    This method should be run after the rasters have been clipped, merged or
    its array values have been recalculated.

    Args:
        root: Directory path where the images will be written to.
        file_format: File extension.
        **kwargs: Keyword arguments passed on to the writing function.

    Raises:
        ValueError: If the arrays are not loaded or any raster lacks a name.
    """
    self._check_for_array()

    if any(raster.name is None for raster in self):
        # The file name is derived from the raster name.
        raise ValueError(
            "All rasters must have a 'name' in order to be written to file."
        )

    paths = [
        (Path(root) / raster.name).with_suffix(f".{file_format}") for raster in self
    ]

    if self.parallelizer:
        self.parallelizer.starmap(
            _write_func, zip(self, paths, strict=False), kwargs=kwargs
        )
        return

    for raster, path in zip(self, paths, strict=False):
        _write_func(raster, path, **kwargs)
718
-
719
def write_df(self, folder: str) -> None:
    """Write the rasters' metadata as a parquet file named 'CUBE_DF_NAME'.

    Args:
        folder: Existing directory to write the file to.

    Raises:
        ValueError: If 'folder' is not an existing directory.
    """
    df = pd.DataFrame(self.meta)

    folder = Path(folder)
    if not folder.is_dir():
        raise ValueError(f"'folder' must be an existing directory. Got {folder}")

    if is_dapla():
        write_pandas(df, folder / self.CUBE_DF_NAME)
    else:
        df.to_parquet(folder / self.CUBE_DF_NAME)
731
-
732
def calculate_index(
    self,
    index_func: Callable,
    band_name1: str,
    band_name2: str,
    copy: bool = True,
    **kwargs,
) -> Self:
    """Calculate an index for each pair of rasters of the two bands.

    Returns a new cube of the same class holding the calculated rasters.
    """
    target = self if not copy else self.copy()

    pairs: list[tuple[Raster, Raster]] = get_raster_pairs(
        target, band_name1=band_name1, band_name2=band_name2
    )

    # Explicit keyword arguments take precedence over the default formula.
    call_kwargs = {"index_formula": index_func} | kwargs

    if not self.parallelizer:
        rasters = [index_calc_pair(pair, **call_kwargs) for pair in pairs]
    else:
        rasters = self.parallelizer.map(index_calc_pair, pairs, kwargs=call_kwargs)

    return target.__class__(rasters)
757
-
758
- # def reproject_match(self) -> Self:
759
- # pass
760
-
761
def to_crs(self, crs: Any, copy: bool = True) -> Self:
    """Run 'to_crs' on each raster and store the crs as the cube's warped crs."""
    target = self if not copy else self.copy()
    reprojected = [raster.to_crs(crs) for raster in target]
    target.data = reprojected
    target._warped_crs = crs
    return target
767
-
768
def set_crs(
    self, crs: Any, allow_override: bool = False, copy: bool = True
) -> Self:
    """Run 'set_crs' on each raster and store the crs as the cube's warped crs."""
    target = self if not copy else self.copy()
    updated = [
        raster.set_crs(crs, allow_override=allow_override) for raster in target
    ]
    target.data = updated
    target._warped_crs = crs
    return target
776
-
777
def min(self) -> Series:
    """Return the result of each raster's 'min' method as a Series named 'min'."""
    minimums = self.run_raster_method("min")
    return Series(minimums, name="min")
783
-
784
def max(self) -> Series:
    """Return the result of each raster's 'max' method as a Series named 'max'."""
    maximums = self.run_raster_method("max")
    return Series(maximums, name="max")
790
-
791
def raster_attribute(self, attribute: str) -> Series | GeoSeries:
    """Collect an attribute from every raster.

    Returns a GeoSeries if any value is a geometry, otherwise a Series.
    """
    values = [getattr(raster, attribute) for raster in self]
    has_geometry = any(isinstance(value, Geometry) for value in values)
    series_type = GeoSeries if has_geometry else Series
    return series_type(values, name=attribute)
797
-
798
def run_raster_method(
    self, method: str, *args, copy: bool = True, return_self: bool = False, **kwargs
) -> Self:
    """Call the named method on each raster through 'raster_map'.

    Raises:
        AttributeError: If any raster lacks the method.
    """
    for raster in self:
        if not hasattr(raster, method):
            raise AttributeError(f"Raster has no method {method!r}.")

    bound_method = functools.partial(
        _method_as_func, *args, method=method, **kwargs
    )

    target = self if not copy else self.copy()
    return target.raster_map(bound_method, return_self=return_self)
812
-
813
@property
def meta(self) -> list[dict]:
    """The 'meta' attribute of each raster, as a list."""
    metadata = []
    for raster in self:
        metadata.append(raster.meta)
    return metadata
817
-
818
- # @property
819
- # def cube_df_meta(self) -> dict[list]:
820
- # return {
821
- # "path": [r.path for r in self],
822
- # "indexes": [r.indexes for r in self],
823
- # "type": [r.__class__.__name__ for r in self],
824
- # "bounds": [r.bounds for r in self],
825
- # "crs": [crs_to_string(r.crs) for r in self],
826
- # }
827
-
828
@property
def data(self) -> list[Raster]:
    """The Rasters of the cube as a list."""
    rasters = self._data
    return rasters

@data.setter
def data(self, data: list[Raster]):
    # The index is rebuilt from scratch on every assignment.
    self.index = Index(interleaved=False, properties=Property(dimension=3))

    if data is None or not len(data):
        self._data = []
        return

    if not all(isinstance(item, Raster) for item in data):
        type_names = {type(item).__name__ for item in data}
        raise TypeError(f"data must be Raster. Got {', '.join(type_names)}")

    self._data = list(data)

    for position, raster in enumerate(self._data):
        # Time bounds default to (0, 1) when the raster has no date or the
        # date cannot be interpreted.
        mint, maxt = 0, 1
        if raster.date:
            try:
                mint, maxt = disambiguate_timestamp(raster.date, self.date_format)
            except (NameError, TypeError):
                mint, maxt = 0, 1

        # important: torchgeo has a different order of the bbox than shapely and geopandas
        minx, miny, maxx, maxy = raster.bounds
        self.index.insert(position, (minx, maxx, miny, maxy, mint, maxt))
856
-
857
@property
def arrays(self) -> list[np.ndarray]:
    """The 'values' of each raster, as a list."""
    values = []
    for raster in self:
        values.append(raster.values)
    return values

@arrays.setter
def arrays(self, new_arrays: list[np.ndarray]):
    n_arrays = len(new_arrays)
    n_rasters = len(self)
    if n_arrays != n_rasters:
        raise ValueError(
            f"Number of arrays ({n_arrays}) must be same as length as cube ({n_rasters})."
        )
    if any(not isinstance(array, np.ndarray) for array in new_arrays):
        raise ValueError("Must be list of numpy ndarrays")

    # Each raster is updated with the array at the same position.
    updated = []
    for raster, array in zip(self, new_arrays, strict=False):
        updated.append(raster.update(array=array))
    self.data = updated
875
-
876
@property
def band(self) -> Series:
    """The 'band' attribute of each raster, as a Series named 'band'."""
    bands = [raster.band for raster in self]
    return Series(bands, name="band")
883
-
884
@property
def dtype(self) -> Series:
    """The 'dtype' attribute of each raster, as a Series named 'dtype'."""
    dtypes = [raster.dtype for raster in self]
    return Series(dtypes, name="dtype")
891
-
892
@property
def nodata(self) -> int | None:
    """The nodata value stored on the cube."""
    value = self._nodata
    return value
896
-
897
@property
def path(self) -> Series:
    """The 'path' attribute of each raster."""
    return self.raster_attribute(attribute="path")
901
-
902
@property
def name(self) -> Series:
    """The 'name' attribute of each raster."""
    return self.raster_attribute(attribute="name")
906
-
907
@property
def date(self) -> Series:
    """The 'date' attribute of each raster."""
    return self.raster_attribute(attribute="date")
911
-
912
@property
def indexes(self) -> Series:
    """The 'indexes' attribute of each raster."""
    return self.raster_attribute(attribute="indexes")
916
-
917
- # @property
918
- # def raster_id(self) -> Series:
919
- # return self.raster_attribute("raster_id")
920
-
921
@property
def area(self) -> Series:
    """The 'area' attribute of each raster."""
    return self.raster_attribute(attribute="area")
925
-
926
@property
def length(self) -> Series:
    """The 'length' attribute of each raster."""
    return self.raster_attribute(attribute="length")
930
-
931
@property
def height(self) -> Series:
    """The 'height' attribute of each raster."""
    return self.raster_attribute(attribute="height")
935
-
936
@property
def width(self) -> Series:
    """The 'width' attribute of each raster."""
    return self.raster_attribute(attribute="width")
940
-
941
@property
def shape(self) -> Series:
    """The 'shape' attribute of each raster."""
    return self.raster_attribute(attribute="shape")
945
-
946
@property
def count(self) -> Series:
    """The 'count' attribute of each raster."""
    return self.raster_attribute(attribute="count")
950
-
951
@property
def res(self) -> int:
    """The spatial resolution stored on the cube."""
    resolution = self._res
    return resolution

@res.setter
def res(self, value) -> None:
    # Stored as given; no validation is done here.
    self._res = value
959
-
960
@property
def crs(self) -> pyproj.CRS:
    """Coordinate reference system of the images.

    Returns the warped crs if one has been set by 'to_crs' or 'set_crs',
    otherwise the stored crs. If that is None, the common crs of the
    rasters is returned, or None when it cannot be determined.
    """
    crs = self._warped_crs if hasattr(self, "_warped_crs") else self._crs
    if crs is not None:
        return crs
    try:
        # The result must be returned; it was previously computed and then
        # discarded, so this fallback always gave None.
        return get_common_crs(self.data)
    except (ValueError, IndexError):
        # Same exceptions as handled for this call in __init__.
        return None
970
-
971
@property
def unary_union(self) -> Geometry:
    """Union of the bounding boxes of all rasters."""
    boxes = []
    for raster in self:
        boxes.append(shapely.box(*raster.bounds))
    return shapely.union_all(boxes)
975
-
976
@property
def centroid(self) -> GeoSeries:
    """The 'centroid' attribute of each raster, as a GeoSeries in the cube's crs."""
    centroids = [raster.centroid for raster in self]
    return GeoSeries(centroids, name="centroid", crs=self.crs)
984
-
985
@property
def tile(self) -> Series:
    """Per-raster 'tile' values, collected via ``raster_attribute``."""
    attribute_name = "tile"
    return self.raster_attribute(attribute_name)
989
-
990
@property
def boxes(self) -> GeoSeries:
    """Bounding-box polygon of each raster, as a GeoSeries in the cube's CRS.

    Rasters whose ``bounds`` is None get a None entry instead of a polygon.
    """
    polygons = []
    for raster in self:
        polygons.append(
            None if raster.bounds is None else shapely.box(*raster.bounds)
        )
    return GeoSeries(polygons, name="boxes", crs=self.crs)
998
-
999
@property
def total_bounds(self) -> tuple[float, float, float, float]:
    """Combined bounds of all raster boxes, as a plain tuple.

    The values are taken from ``self.boxes.total_bounds``.
    """
    combined = self.boxes.total_bounds
    return tuple(combined)
1003
-
1004
@property
def bounds(self) -> BoundingBox:
    """Pytorch bounds of the index.

    Returns:
        A BoundingBox built from ``self.index.bounds``:
        (minx, maxx, miny, maxy, mint, maxt) of the dataset.
    """
    index_bounds = self.index.bounds
    return BoundingBox(*index_bounds)
1012
-
1013
def copy(self, deep: bool = True) -> Self:
    """Return a copy of the cube in which every raster is itself copied.

    Args:
        deep: If True (default), the cube is duplicated with ``deepcopy``;
            otherwise with a shallow ``copy``. In both cases each raster
            is then replaced by the result of its own ``copy()`` method.

    Returns:
        The copied cube.
    """
    if deep:
        duplicate = deepcopy(self)
    else:
        duplicate = copy(self)
    duplicate.data = [raster.copy() for raster in duplicate]
    return duplicate
1022
-
1023
def _check_for_array(self, text: str = "") -> None:
    """Raise if none of the rasters has a loaded array.

    Args:
        text: Extra text appended to the error message.

    Raises:
        ValueError: If ``values`` is None for every raster (this includes
            an empty cube).
    """
    message = "Arrays are not loaded. " + text
    nothing_loaded = all(raster.values is None for raster in self)
    if nothing_loaded:
        raise ValueError(message)
1027
-
1028
def __getitem__(
    self,
    item: (
        str
        | slice
        | int
        | Series
        | list
        | tuple
        | Callable
        | Geometry
        | BoundingBox
    ),
) -> Self | Raster | TORCHGEO_RETURN_TYPE:
    """Select one or more of the Rasters based on indexing or spatial or boolean predicates.

    Args:
        item: The selector. Handled as follows:
            - str: keep rasters whose path contains the string. A single
              match returns that raster, no match returns an empty Raster.
            - slice: keep the sliced rasters.
            - int: return the raster at that position.
            - callable: called with a copy of the cube; the result is used
              as a boolean mask.
            - BoundingBox: delegated to ``cube_to_torchgeo``.
            - GeoDataFrame, GeoSeries, Geometry or bbox-like: keep rasters
              whose bounding box intersects the geometry.
            - anything else: treated as a boolean mask with one entry per
              raster.

    Returns:
        A copy of the cube holding the selected rasters, a single raster,
        or the value returned by ``cube_to_torchgeo``.

    Raises:
        ValueError: If a boolean mask does not have the same length as the
            cube (from ``zip(..., strict=True)``).

    Examples:
    ------------
    >>> import sgis as sg
    >>> root = 'https://media.githubusercontent.com/media/statisticsnorway/ssb-sgis/main/tests/testdata/raster'
    >>> cube = sg.DataCube.from_root(root, filename_regex=sg.raster.SENTINEL2_FILENAME_REGEX, crs=25833).load()

    List slicing:

    >>> cube[1:3]
    >>> cube[3:]

    Single integer returns a Raster, not a cube.

    >>> cube[1]

    Boolean conditioning based on cube properties and pandas boolean Series:

    >>> cube[(cube.length > 0) & (cube.path.str.contains("FRC_B"))]
    >>> cube[lambda x: (x.length > 0) & (x.path.str.contains("dtm"))]

    """
    # Named 'selection' rather than 'copy' so the imported copy() function
    # is not shadowed inside this method.
    selection = self.copy()
    if isinstance(item, str) and selection.path is not None:
        selection.data = [raster for raster in selection if item in raster.path]
        if len(selection) == 1:
            return selection[0]
        elif not len(selection):
            return Raster()
        return selection

    if isinstance(item, slice):
        selection.data = selection.data[item]
        return selection
    elif isinstance(item, int):
        return selection.data[item]
    elif callable(item):
        # The callable produces the boolean mask applied below.
        item = item(selection)
    elif isinstance(item, BoundingBox):
        return cube_to_torchgeo(self, item)

    elif isinstance(item, (GeoDataFrame, GeoSeries, Geometry)) or is_bbox_like(
        item
    ):
        geometry = to_shapely(item)
        # raster.bounds is unpacked as box coordinates elsewhere in this
        # class, so it has to be turned into a polygon before the
        # intersection test. Rasters without bounds cannot match.
        selection.data = [
            raster
            for raster in selection.data
            if raster.bounds is not None
            and shapely.box(*raster.bounds).intersects(geometry)
        ]
        return selection

    selection.data = [
        raster
        for raster, condition in zip(selection.data, item, strict=True)
        if condition
    ]

    return selection
1100
-
1101
def __setattr__(self, attr: str, value: Any) -> None:
    """Set an attribute of the cube.

    List-like values assigned to anything other than 'data'/'_data' must
    have one element per raster once ``data`` exists on the instance.

    Args:
        attr: Name of the attribute.
        value: Value to assign.

    Raises:
        ValueError: If a list-like value has a different length than
            ``self.data``.
    """
    needs_length_check = (
        attr not in ["data", "_data"]
        and is_list_like(value)
        and hasattr(self, "data")
    )
    if needs_length_check and len(value) != len(self.data):
        raise ValueError(
            "custom cube attributes must be scalar or same length as number of rasters. "
            f"Got self.data {len(self)} and new attribute {len(value)}"
        )
    return super().__setattr__(attr, value)
1115
-
1116
def __iter__(self) -> Iterator[Raster]:
    """Return an iterator over the rasters held in ``self.data``."""
    rasters = self.data
    return iter(rasters)
1119
-
1120
def __len__(self) -> int:
    """Return how many rasters ``self.data`` holds."""
    rasters = self.data
    return len(rasters)
1123
-
1124
def __repr__(self) -> str:
    """Return '<ClassName>(<number of rasters>)'."""
    class_name = self.__class__.__name__
    n_rasters = len(self)
    return f"{class_name}({n_rasters})"
1127
-
1128
- # def __mul__(self, scalar) -> Self:
1129
- # return self.map(_mul, scalar=scalar)
1130
-
1131
- # def __add__(self, scalar) -> Self:
1132
- # return self.map(_add, scalar=scalar)
1133
-
1134
- # def __sub__(self, scalar) -> Self:
1135
- # return self.map(_sub, scalar=scalar)
1136
-
1137
- # def __truediv__(self, scalar) -> Self:
1138
- # return self.map(_truediv, scalar=scalar)
1139
-
1140
- # def __floordiv__(self, scalar) -> Self:
1141
- # return self.map(_floordiv, scalar=scalar)
1142
-
1143
- # def __pow__(self, scalar) -> Self:
1144
- # return self.map(_pow, scalar=scalar)
1145
-
1146
-
1147
def concat_cubes(cubes: list[DataCube], res: int | None = None) -> DataCube:
    """Combine the rasters of several cubes into one new cube.

    Args:
        cubes: A sequence of DataCubes.
        res: Spatial resolution passed on to the new DataCube.

    Returns:
        A DataCube holding the rasters of all input cubes, in input order.

    Raises:
        TypeError: If any element of 'cubes' is not a DataCube.
    """
    every_item_is_cube = all(isinstance(cube, DataCube) for cube in cubes)
    if not every_item_is_cube:
        raise TypeError("cubes must be of type DataCube.")

    rasters = []
    for cube in cubes:
        rasters.extend(cube.data)
    return DataCube(rasters, res=res)
1163
-
1164
-
1165
def _clipmerge(cube: DataCube, mask: Any, **kwargs) -> DataCube:
    """Call ``_merge`` on the cube with 'mask' passed as its 'bounds'."""
    merged = _merge(cube, bounds=mask, **kwargs)
    return merged
1167
-
1168
-
1169
def _merge(
    cube: DataCube,
    by: str | list[str] | None = None,
    bounds: Any | None = None,
    **kwargs,
) -> DataCube:
    """Merge the rasters of a cube, optionally grouped by cube attributes.

    NOTE(review): a second module-level function named ``_merge`` is defined
    further down in this file. At import time that later definition replaces
    this one, so every ``_merge(...)`` call, including the calls inside this
    body, resolves to the later function. Confirm which definition is
    intended before relying on this one.

    Args:
        cube: The cube to merge. No raster may have a loaded array.
        by: Cube attribute name(s) to group the rasters by before merging.
        bounds: Optional bounds, converted with ``to_bbox``.
        **kwargs: Forwarded to ``_merge`` only when 'by' is None.

    Returns:
        The merged cube(s) concatenated into one cube.

    Raises:
        ValueError: If any raster has a loaded array.
        TypeError: If 'by' is neither a string nor list-like.
    """
    if any(raster.values is not None for raster in cube):
        raise ValueError("Arrays can't be loaded when calling merge.")

    if bounds is not None:
        bounds = to_bbox(bounds)

    if by is None:
        # NOTE(review): if this name resolved to this same function, the
        # call would recurse without end.
        return _merge(cube, bounds=bounds, **kwargs)

    if isinstance(by, str):
        by = [by]
    elif not is_list_like(by):
        raise TypeError("'by' should be string or list like.", by)

    positions = {"i": range(len(cube)), "tile": cube.tile}
    group_columns = {x: getattr(cube, x) for x in by}
    df = DataFrame(positions | group_columns)

    # One array of raster positions per group.
    grouped_indices = df.groupby(by)["i"].unique()
    indices = Series(range(len(cube)))

    merged_parts = [
        _merge(cube[indices.isin(idxs)], bounds=bounds) for idxs in grouped_indices
    ]
    return concat_cubes(merged_parts, res=cube.res)
1209
-
1210
-
1211
def _merge_by_bounds(
    cube: DataCube,
    by: str | list[str] | None = None,
    bounds: Any | None = None,
    **kwargs,
) -> DataCube:
    """Call ``_merge`` with 'tile' appended to the grouping keys.

    Args:
        cube: The cube to merge.
        by: Optional attribute name(s) to group by in addition to 'tile'.
        bounds: Passed through to ``_merge``.
        **kwargs: Passed through to ``_merge``.

    Returns:
        The result of ``_merge``.
    """
    if by is None:
        group_keys = ["tile"]
    elif isinstance(by, str):
        group_keys = [by, "tile"]
    else:
        group_keys = [*by, "tile"]

    return _merge(cube, by=group_keys, bounds=bounds, **kwargs)
1230
-
1231
-
1232
def _merge(cube: DataCube, **kwargs) -> DataCube:
    """Merge all rasters of the cube into a single raster with rasterio.

    The cube is modified in place: its ``data`` is replaced by a one-element
    list holding the merged raster, and ``_crs`` is set from the rasters if
    the cube has no CRS.

    Args:
        cube: The cube whose rasters are merged. The rasters are opened
            from their ``path``.
        **kwargs: Keyword arguments forwarded to ``rasterio.merge.merge``.
            A 'by' key is accepted and discarded, since it is not a
            rasterio option.

    Returns:
        The same cube object, now holding the merged raster.
    """
    # Callers such as _clipmerge do not pass 'by', so a missing key must not
    # raise KeyError.
    kwargs.pop("by", None)
    if cube.crs is None:
        cube._crs = get_common_crs(cube.data)

    indexes = cube[0].indexes_as_tuple()

    datasets = []
    try:
        for raster in cube:
            datasets.append(_load_raster(raster.path))
        array, transform = rasterio_merge.merge(datasets, indexes=indexes, **kwargs)
    finally:
        # The merged array is held in memory, so the opened readers are
        # released whether or not the merge succeeded.
        for dataset in datasets:
            dataset.close()

    cube.data = [Raster.from_array(array, crs=cube.crs, transform=transform)]

    return cube
1244
-
1245
-
1246
def _load_raster(path: str | Path) -> rasterio.io.DatasetReader:
    """Open the raster at 'path' with rasterio and return the reader."""
    # NOTE(review): the reader is returned after the ``opener`` context has
    # exited; confirm it stays usable once that handle is released.
    with opener(path) as handle:
        reader = rasterio.open(handle)
    return reader
1249
-
1250
-
1251
def numpy_to_torch(array: np.ndarray) -> torch.Tensor:
    """Convert numpy array to a pytorch tensor.

    uint16 arrays are cast to int32 and uint32 arrays to int64 before the
    conversion; all other dtypes are passed to ``torch.tensor`` unchanged.
    """
    # Cast the dtypes that the original author marked as unsupported by
    # pytorch tensors to a wider signed integer type.
    widened_dtypes = ((np.uint16, np.int32), (np.uint32, np.int64))
    for unsupported, replacement in widened_dtypes:
        if array.dtype == unsupported:
            array = array.astype(replacement)
            break

    return torch.tensor(array)
1260
-
1261
-
1262
def cube_to_torchgeo(cube: DataCube, query: BoundingBox) -> TORCHGEO_RETURN_TYPE:
    """Convert a DataCube to the type of dict returned from torchgeo datasets __getitem__.

    Args:
        cube: The cube to sample from.
        query: The bounding box to sample; also returned under the 'bbox' key.

    Returns:
        A dict with the concatenated tensor under 'image' (if
        ``cube.is_image``) or 'mask', plus 'crs' and 'bbox'.
    """
    footprint = shapely.box(*to_bbox(query))
    if cube.separate_files:
        prepared = cube.sfilter(footprint).explode().load()
    else:
        prepared = cube.clipmerge(footprint).explode()

    tensors = [numpy_to_torch(array) for array in prepared.arrays]
    data: torch.Tensor = torch.cat(tensors)

    if prepared.is_image:
        key = "image"
    else:
        key = "mask"
    return {key: data, "crs": prepared.crs, "bbox": query}