giga-spatial 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
@@ -0,0 +1,772 @@
1
+ # from dataclasses import dataclass
2
+ from pathlib import Path
3
+ import functools
4
+ import multiprocessing
5
+ from typing import List, Optional, Union, Literal, Iterable, Tuple
6
+ import geopandas as gpd
7
+ import pandas as pd
8
+ import numpy as np
9
+ from pydantic.dataclasses import dataclass
10
+ from shapely.geometry import Point, MultiPoint
11
+ from shapely.geometry.base import BaseGeometry
12
+ from enum import Enum
13
+ import requests
14
+ from tqdm import tqdm
15
+ import zipfile
16
+ import tempfile
17
+ import shutil
18
+ from pydantic import (
19
+ HttpUrl,
20
+ Field,
21
+ model_validator,
22
+ field_validator,
23
+ ConfigDict,
24
+ )
25
+ import logging
26
+
27
+ from gigaspatial.core.io.data_store import DataStore
28
+ from gigaspatial.core.io.local_data_store import LocalDataStore
29
+ from gigaspatial.handlers.boundaries import AdminBoundaries
30
+ from gigaspatial.processing.tif_processor import TifProcessor
31
+ from gigaspatial.handlers.base import (
32
+ BaseHandlerConfig,
33
+ BaseHandlerDownloader,
34
+ BaseHandlerReader,
35
+ BaseHandler,
36
+ )
37
+ from gigaspatial.config import config as global_config
38
+
39
+
40
class CoordSystem(int, Enum):
    """Numeric EPSG/ESRI codes for the projections GHSL products are published in."""

    WGS84 = 4326  # geographic lat/lon (EPSG:4326)
    Mollweide = 54009  # equal-area world projection (ESRI:54009)
45
+
46
+
47
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GHSLDataConfig(BaseHandlerConfig):
    """Configuration for GHSL (Global Human Settlement Layer) datasets.

    Validates the product/year/resolution/coordinate-system combination
    against GHSL availability rules, loads the tile-grid shapefile used to
    resolve which tiles cover an area of interest, and builds download URLs
    and local storage paths for the configured product.
    """

    # constants
    AVAILABLE_YEARS: List = Field(default=np.append(np.arange(1975, 2031, 5), 2018))
    AVAILABLE_RESOLUTIONS: List = Field(default=[10, 100, 1000])

    # base config
    GHSL_DB_BASE_URL: HttpUrl = Field(
        default="https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/"
    )
    TILES_URL: str = "https://ghsl.jrc.ec.europa.eu/download/GHSL_data_{}_shapefile.zip"

    # user config
    base_path: Path = Field(default=global_config.get_path("ghsl", "bronze"))
    coord_system: CoordSystem = CoordSystem.WGS84
    release: str = "R2023A"

    product: Literal[
        "GHS_BUILT_S",
        "GHS_BUILT_H_AGBH",
        "GHS_BUILT_H_ANBH",
        "GHS_BUILT_V",
        "GHS_POP",
        "GHS_SMOD",
    ] = Field(...)
    year: int = 2020
    resolution: int = 100

    def __post_init__(self):
        super().__post_init__()
        # CoordSystem is an int-mixin enum, so format() yields the numeric
        # code (e.g. 4326), which is what the tiles URL expects.
        self.TILES_URL = self.TILES_URL.format(self.coord_system)
        self._load_tiles()

    def _load_tiles(self):
        """Load the GHSL tile grid from the remote tiles shapefile."""
        try:
            self.tiles_gdf = gpd.read_file(self.TILES_URL)
        except Exception as e:
            self.logger.error(f"Failed to download tiles shapefile: {e}")
            raise ValueError(
                f"Could not download GHSL tiles from {self.TILES_URL}"
            ) from e

    @field_validator("year")
    def validate_year(cls, value: int) -> int:
        """Ensure the requested year is a published GHSL epoch."""
        # NOTE(review): AVAILABLE_YEARS is declared via pydantic Field();
        # confirm cls.AVAILABLE_YEARS resolves to the default sequence here.
        if value in cls.AVAILABLE_YEARS:
            return value
        raise ValueError(
            f"No datasets found for the provided year: {value}\nAvailable years are: {cls.AVAILABLE_YEARS}"
        )

    @field_validator("resolution")
    def validate_resolution(cls, value: int) -> int:
        """Ensure the requested resolution is one GHSL publishes."""
        if value in cls.AVAILABLE_RESOLUTIONS:
            return value
        raise ValueError(
            f"No datasets found for the provided resolution: {value}\nAvailable resolutions are: {cls.AVAILABLE_RESOLUTIONS}"
        )

    @model_validator(mode="after")
    def validate_configuration(self):
        """
        Validate that the configuration is valid based on dataset availability constraints.

        Rules enforced (invalid combinations raise; combinations with a single
        valid option are coerced with a warning):
            - GHS_BUILT_V, GHS_POP and GHS_SMOD have no 2018 epoch.
            - 10m resolution exists only for building height products and the
              2018 built-up surface product (Mollweide only).
            - Building height products are only published for 2018.
            - GHS_SMOD is only published at 1000m in Mollweide.
        """
        if self.year == 2018 and self.product in ["GHS_BUILT_V", "GHS_POP", "GHS_SMOD"]:
            raise ValueError(f"{self.product} product is not available for 2018")

        # Fix: the original compared product != "GHS_BUILT_H", which never
        # matches the real height product names (GHS_BUILT_H_AGBH/ANBH) and
        # also rejected GHS_BUILT_S, whose 10m/2018 case is handled below.
        if self.resolution == 10 and not (
            "GHS_BUILT_H" in self.product or self.product == "GHS_BUILT_S"
        ):
            raise ValueError(
                f"{self.product} product is not available at 10 (10m) resolution"
            )

        if "GHS_BUILT_H" in self.product:
            if self.year != 2018:
                self.logger.warning(
                    "Building height product is only available for 2018, year is set as 2018"
                )
                self.year = 2018

        if self.product == "GHS_BUILT_S":
            if self.year == 2018 and self.resolution != 10:
                self.logger.warning(
                    "Built-up surface product for 2018 is only available at 10m resolution, resolution is set as 10m"
                )
                self.resolution = 10

            if self.resolution == 10 and self.year != 2018:
                self.logger.warning(
                    "Built-up surface product at resolution 10 is only available for 2018, year is set as 2018"
                )
                self.year = 2018

            if self.resolution == 10 and self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Built-up surface product at resolution 10 is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        if self.product == "GHS_SMOD":
            if self.resolution != 1000:
                self.logger.warning(
                    f"Settlement model (SMOD) product is only available at 1000 (1km) resolution, resolution is set as 1000"
                )
                self.resolution = 1000

            if self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Settlement model (SMOD) product is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        return self

    @property
    def crs(self) -> str:
        """CRS string matching the configured coordinate system."""
        return "EPSG:4326" if self.coord_system == CoordSystem.WGS84 else "ESRI:54009"

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a given geometry or GeoDataFrame.
        """
        return self._get_relevant_tiles(geometry)

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a list of points.
        """
        return self._get_relevant_tiles(points)

    def get_data_unit_path(
        self, unit: Optional[str] = None, file_ext: str = ".zip", **kwargs
    ) -> Path:
        """Construct and return the path for the configured dataset or dataset tile.

        Args:
            unit: Optional tile id; when omitted the path of the global file
                is returned.
            file_ext: File extension for the stored artifact (default ".zip").
        """
        info = self._get_product_info()

        filename = (
            f"{info['product_name']}_V{info['product_version']}_0"
            + (f"_{unit}" if unit else "")
            + file_ext
        )
        return self.base_path / info["product_folder"] / filename

    def compute_dataset_url(self, tile_id=None) -> str:
        """Compute the download URL for a GHSL dataset (or a single tile of it)."""
        info = self._get_product_info()

        path_segments = [
            str(self.GHSL_DB_BASE_URL).rstrip("/"),
            info["product_folder"],
            info["product_name"],
            f"V{info['product_version']}-0",
            "tiles" if tile_id else "",
            f"{info['product_name']}_V{info['product_version']}_0"
            + (f"_{tile_id}" if tile_id else "")
            + ".zip",
        ]

        # Drop empty segments (there is no "tiles" level for the global file)
        # so the join never emits a "//" in the URL.
        return "/".join(seg for seg in path_segments if seg)

    def _get_relevant_tiles(
        self,
        source: Union[
            BaseGeometry,
            gpd.GeoDataFrame,
            Iterable[Union[Point, tuple]],
        ],
        crs="EPSG:4326",
    ) -> list:
        """
        Identify and return the GHSL tiles that spatially intersect with the given geometry.

        The input geometry can be a Shapely geometry object, a GeoDataFrame,
        or an iterable of Point objects or (lat, lon) tuples. The method
        reprojects the input to the tile grid's CRS before intersecting.

        Args:
            source: A Shapely geometry, a GeoDataFrame, or an iterable of
                Point objects or (lat, lon) tuples representing the area of
                interest.
            crs: CRS of `source` when it is not a GeoDataFrame.

        Returns:
            A list of the tile ids for the intersecting tiles.

        Raises:
            ValueError: If the input `source` is not one of the supported types.
        """
        if isinstance(source, gpd.GeoDataFrame):
            if source.crs != "EPSG:4326":
                source = source.to_crs("EPSG:4326")
            search_geom = source.geometry.unary_union
        elif isinstance(source, BaseGeometry):
            search_geom = source
        elif isinstance(source, Iterable):
            # Materialize first so a generator isn't exhausted by validation.
            items = list(source)
            # Fix: test isinstance(pt, Point) before len(pt) — shapely Points
            # have no __len__, so the original order raised TypeError on them.
            if not all(isinstance(pt, Point) or len(pt) == 2 for pt in items):
                raise ValueError(
                    f"Expected Geometry, GeoDataFrame or iterable object of Points got {source.__class__}"
                )
            # Tuples are (lat, lon); Point takes (x=lon, y=lat).
            points = [
                pt if isinstance(pt, Point) else Point(pt[1], pt[0]) for pt in items
            ]
            search_geom = MultiPoint(points)
        else:
            raise ValueError(
                f"Expected Geometry, GeoDataFrame or iterable object of Points got {source.__class__}"
            )

        if self.tiles_gdf.crs != crs:
            search_geom = (
                gpd.GeoDataFrame(geometry=[search_geom], crs=crs)
                .to_crs(self.tiles_gdf.crs)
                .geometry.iloc[0]
            )

        # Fix: use the vectorized intersects predicate. The original built a
        # generator expression, which pandas .loc cannot use as a boolean mask.
        mask = self.tiles_gdf.geometry.intersects(search_geom)

        return self.tiles_gdf.loc[mask, "tile_id"].to_list()

    def _get_product_info(self) -> dict:
        """Generate and return common product information used in multiple methods."""
        # WGS84 products are named by arc-second resolution: 100m -> 3ss,
        # 1000m -> 30ss; Mollweide products use the metric value directly.
        resolution_str = (
            str(self.resolution)
            if self.coord_system == CoordSystem.Mollweide
            else ("3ss" if self.resolution == 100 else "30ss")
        )
        product_folder = f"{self.product}_GLOBE_{self.release}"
        product_name = f"{self.product}_E{self.year}_GLOBE_{self.release}_{self.coord_system}_{resolution_str}"
        # SMOD is published as version 2; all other products as version 1.
        product_version = 2 if self.product == "GHS_SMOD" else 1

        return {
            "resolution_str": resolution_str,
            "product_folder": product_folder,
            "product_name": product_name,
            "product_version": product_version,
        }

    def __repr__(self) -> str:
        """Return a string representation of the GHSL dataset configuration."""
        return (
            f"GHSLDataConfig("
            f"product='{self.product}', "
            f"year={self.year}, "
            f"resolution={self.resolution}, "
            f"coord_system={self.coord_system.name}, "
            f"release='{self.release}'"
            f")"
        )
306
+
307
+
308
class GHSLDataDownloader(BaseHandlerDownloader):
    """A class to handle downloads of GHSL datasets."""

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_id: str,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> Optional[Union[Path, List[Path]]]:
        """
        Downloads and optionally extracts files for a given tile.

        Args:
            tile_id: tile ID to process.
            extract: If True and the downloaded file is a zip, extract its contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods

        Returns:
            Path to the downloaded file if extract=False,
            List of paths to the extracted files if extract=True,
            None on failure.
        """
        url = self.config.compute_dataset_url(tile_id=tile_id)
        output_path = self.config.get_data_unit_path(tile_id)

        if not extract:
            return self._download_file(url, output_path)

        import re

        extracted_files: List[Path] = []
        temp_path: Optional[Path] = None

        try:
            # Create the temp file and close its handle before writing to it:
            # the original downloaded while the NamedTemporaryFile handle was
            # still open, which fails on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
                temp_path = Path(temp_file.name)

            downloaded_path = self._download_file(url, temp_path)
            if not downloaded_path:
                return None

            with zipfile.ZipFile(str(downloaded_path), "r") as zip_ref:
                if file_pattern:
                    pattern = re.compile(file_pattern)
                    files_to_extract = [
                        f for f in zip_ref.namelist() if pattern.match(f)
                    ]
                else:
                    files_to_extract = zip_ref.namelist()

                for file in files_to_extract:
                    extracted_path = output_path.parent / Path(file).name
                    # Ensure the destination directory exists before writing.
                    extracted_path.parent.mkdir(parents=True, exist_ok=True)
                    # NOTE(review): extracted files are written with builtin
                    # open() while _download_file writes via self.data_store —
                    # confirm this asymmetry is intended for non-local stores.
                    with zip_ref.open(file) as source, open(
                        extracted_path, "wb"
                    ) as target:
                        shutil.copyfileobj(source, target)
                    extracted_files.append(extracted_path)
                    self.logger.info(f"Extracted {file} to {extracted_path}")

            return extracted_files

        except Exception as e:
            self.logger.error(f"Error downloading/extracting tile {tile_id}: {e}")
            return None
        finally:
            # Always remove the temporary zip; the original leaked it when
            # extraction raised.
            if temp_path is not None:
                temp_path.unlink(missing_ok=True)

    def download_data_units(
        self,
        tile_ids: List[str],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Downloads multiple tiles in parallel, with an option to extract them.

        Args:
            tile_ids: A list of tile IDs to download.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods

        Returns:
            A list where each element corresponds to a tile ID and contains:
            - Path to the downloaded file if extract=False.
            - List of paths to extracted files if extract=True.
            - None if the download or extraction failed for a tile.
        """
        if not tile_ids:
            self.logger.warning("No tiles to download")
            return []

        with multiprocessing.Pool(processes=self.config.n_workers) as pool:
            download_func = functools.partial(
                self.download_data_unit, extract=extract, file_pattern=file_pattern
            )
            file_paths = list(
                tqdm(
                    pool.imap(download_func, tile_ids),
                    total=len(tile_ids),
                    desc="Downloading data",
                )
            )

        return file_paths

    def download(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # shapely geoms
            gpd.GeoDataFrame,
        ],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specified geographic region.

        The region can be defined by a country code/name, a list of points,
        a Shapely geometry, or a GeoDataFrame. This method identifies the
        relevant GHSL tiles intersecting the region and downloads the
        data for those tiles in parallel.

        Args:
            source: Defines the geographic area for which to download data.
                Can be:
                  - A string representing a country code or name.
                  - A list of (latitude, longitude) tuples or Shapely Point objects.
                  - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                  - A GeoDataFrame with geometry column in EPSG:4326.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments. These will be passed down to
                      `AdminBoundaries.create()` (if `source` is a country)
                      and to `self.download_data_units()`.

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the region or if
            all downloads fail.
        """

        tiles = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(
            tiles, extract=extract, file_pattern=file_pattern, **kwargs
        )

    def download_by_country(
        self,
        country_code: str,
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country_code: The country code (e.g., 'USA', 'GBR') or name.
            data_store: Optional instance of a `DataStore` to be used by
                        `AdminBoundaries` for loading country boundaries. If None,
                        `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                               country boundary. If provided, this boundary is used
                               instead of the default from `AdminBoundaries`.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments that are passed to
                      `download_data_units`.

        Returns:
            A list of local file paths for the successfully downloaded tiles
            for the specified country.
        """
        return self.download(
            source=country_code,
            data_store=data_store,
            path=country_geom_path,
            extract=extract,
            file_pattern=file_pattern,
            **kwargs,
        )

    def _download_file(self, url: str, output_path: Path) -> Optional[Path]:
        """
        Downloads a file from a URL to a specified output path with a progress bar.

        Args:
            url: The URL to download from.
            output_path: The local path to save the downloaded file.

        Returns:
            The path to the downloaded file on success, None on failure.
        """
        try:
            # NOTE(review): no request timeout is set — a stalled connection
            # can hang indefinitely; consider requests.get(..., timeout=...).
            response = requests.get(url, stream=True)
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with self.data_store.open(str(output_path), "wb") as file:
                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {output_path.name}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        # iter_content can yield empty keep-alive chunks.
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.debug(f"Successfully downloaded: {url} to {output_path}")
            return output_path

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url}: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading {url}: {str(e)}")
            return None
555
+
556
+
557
class GHSLDataReader(BaseHandlerReader):
    """Reader for GHSL raster tiles, exposing them as TifProcessor objects."""

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the reader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> List[TifProcessor]:
        """
        Load TifProcessors from GHSL dataset.
        Args:
            source_data_path: List of file paths to load
        Returns:
            List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
        """
        return self._load_raster_data(raster_paths=source_data_path)

    def load(self, source, **kwargs):
        """Load GHSL rasters for `source`, forcing the .tif file extension."""
        # Fix: forward **kwargs — the original silently dropped them.
        return super().load(source=source, file_ext=".tif", **kwargs)
592
+
593
+
594
class GHSLDataHandler(BaseHandler):
    """
    Handler for GHSL (Global Human Settlement Layer) dataset.

    This class provides a unified interface for downloading and loading GHSL data.
    It manages the lifecycle of configuration, downloading, and reading components.
    """

    def __init__(
        self,
        product: Literal[
            "GHS_BUILT_S",
            "GHS_BUILT_H_AGBH",
            "GHS_BUILT_H_ANBH",
            "GHS_BUILT_V",
            "GHS_POP",
            "GHS_SMOD",
        ],
        year: int = 2020,
        resolution: int = 100,
        config: Optional[GHSLDataConfig] = None,
        downloader: Optional[GHSLDataDownloader] = None,
        reader: Optional[GHSLDataReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """
        Initialize the GHSLDataHandler.

        Args:
            product: The GHSL product to use. Must be one of:
                - GHS_BUILT_S: Built-up surface
                - GHS_BUILT_H_AGBH: Average gross building height
                - GHS_BUILT_H_ANBH: Average net building height
                - GHS_BUILT_V: Building volume
                - GHS_POP: Population
                - GHS_SMOD: Settlement model
            year: The year of the data (default: 2020)
            resolution: The resolution in meters (default: 100)
            config: Optional configuration object
            downloader: Optional downloader instance
            reader: Optional reader instance
            data_store: Optional data store instance
            logger: Optional logger instance
            **kwargs: Additional configuration parameters
        """
        # Stash product parameters before the base class builds the config
        # via create_config().
        self._product = product
        self._year = year
        self._resolution = resolution
        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> GHSLDataConfig:
        """
        Create and return a GHSLDataConfig instance.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured GHSLDataConfig instance
        """
        return GHSLDataConfig(
            product=self._product,
            year=self._year,
            resolution=self._resolution,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataDownloader:
        """
        Create and return a GHSLDataDownloader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured GHSLDataDownloader instance
        """
        return GHSLDataDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataReader:
        """
        Create and return a GHSLDataReader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured GHSLDataReader instance
        """
        return GHSLDataReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ):
        """Load GHSL rasters for `source` as TifProcessor objects.

        Downloads are delivered as zips, so extraction of .tif members is
        always enabled here.
        """
        return super().load_data(
            source=source,
            ensure_available=ensure_available,
            file_ext=".tif",
            extract=True,
            file_pattern=r".*\.tif$",
            **kwargs,
        )

    def load_into_dataframe(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Load GHSL data into a pandas DataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            DataFrame containing the GHSL data (empty if no rasters matched)
        """
        tif_processors = self.load_data(
            source=source, ensure_available=ensure_available, **kwargs
        )
        # Fix: pd.concat raises ValueError on an empty list; return an empty
        # frame when no rasters were found instead of crashing.
        if not tif_processors:
            return pd.DataFrame()
        return pd.concat(
            [tp.to_dataframe() for tp in tif_processors], ignore_index=True
        )