giga_spatial-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- giga_spatial-0.6.0.dist-info/METADATA +141 -0
- giga_spatial-0.6.0.dist-info/RECORD +47 -0
- giga_spatial-0.6.0.dist-info/WHEEL +5 -0
- giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
- giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
- gigaspatial/__init__.py +1 -0
- gigaspatial/config.py +226 -0
- gigaspatial/core/__init__.py +0 -0
- gigaspatial/core/io/__init__.py +5 -0
- gigaspatial/core/io/adls_data_store.py +325 -0
- gigaspatial/core/io/data_api.py +113 -0
- gigaspatial/core/io/data_store.py +147 -0
- gigaspatial/core/io/local_data_store.py +92 -0
- gigaspatial/core/io/readers.py +265 -0
- gigaspatial/core/io/writers.py +128 -0
- gigaspatial/core/schemas/__init__.py +0 -0
- gigaspatial/core/schemas/entity.py +244 -0
- gigaspatial/generators/__init__.py +2 -0
- gigaspatial/generators/poi.py +636 -0
- gigaspatial/generators/zonal/__init__.py +3 -0
- gigaspatial/generators/zonal/base.py +370 -0
- gigaspatial/generators/zonal/geometry.py +439 -0
- gigaspatial/generators/zonal/mercator.py +78 -0
- gigaspatial/grid/__init__.py +1 -0
- gigaspatial/grid/mercator_tiles.py +286 -0
- gigaspatial/handlers/__init__.py +40 -0
- gigaspatial/handlers/base.py +761 -0
- gigaspatial/handlers/boundaries.py +305 -0
- gigaspatial/handlers/ghsl.py +772 -0
- gigaspatial/handlers/giga.py +145 -0
- gigaspatial/handlers/google_open_buildings.py +472 -0
- gigaspatial/handlers/hdx.py +241 -0
- gigaspatial/handlers/mapbox_image.py +208 -0
- gigaspatial/handlers/maxar_image.py +291 -0
- gigaspatial/handlers/microsoft_global_buildings.py +548 -0
- gigaspatial/handlers/ookla_speedtest.py +199 -0
- gigaspatial/handlers/opencellid.py +290 -0
- gigaspatial/handlers/osm.py +356 -0
- gigaspatial/handlers/overture.py +126 -0
- gigaspatial/handlers/rwi.py +157 -0
- gigaspatial/handlers/unicef_georepo.py +806 -0
- gigaspatial/handlers/worldpop.py +266 -0
- gigaspatial/processing/__init__.py +4 -0
- gigaspatial/processing/geo.py +1054 -0
- gigaspatial/processing/sat_images.py +39 -0
- gigaspatial/processing/tif_processor.py +477 -0
- gigaspatial/processing/utils.py +49 -0
gigaspatial/handlers/ghsl.py
@@ -0,0 +1,772 @@
# from dataclasses import dataclass
from pathlib import Path
import functools
import multiprocessing
from typing import List, Optional, Union, Literal, Iterable, Tuple
import geopandas as gpd
import pandas as pd
import numpy as np
from pydantic.dataclasses import dataclass
from shapely.geometry import Point, MultiPoint
from shapely.geometry.base import BaseGeometry
from enum import Enum
import requests
from tqdm import tqdm
import zipfile
import tempfile
import shutil
from pydantic import (
    HttpUrl,
    Field,
    model_validator,
    field_validator,
    ConfigDict,
)
import logging

from gigaspatial.core.io.data_store import DataStore
from gigaspatial.core.io.local_data_store import LocalDataStore
from gigaspatial.handlers.boundaries import AdminBoundaries
from gigaspatial.processing.tif_processor import TifProcessor
from gigaspatial.handlers.base import (
    BaseHandlerConfig,
    BaseHandlerDownloader,
    BaseHandlerReader,
    BaseHandler,
)
from gigaspatial.config import config as global_config


class CoordSystem(int, Enum):
    """Enum for coordinate systems used by GHSL datasets."""

    WGS84 = 4326
    Mollweide = 54009


@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GHSLDataConfig(BaseHandlerConfig):
    # constants
    AVAILABLE_YEARS: List = Field(default=np.append(np.arange(1975, 2031, 5), 2018))
    AVAILABLE_RESOLUTIONS: List = Field(default=[10, 100, 1000])

    # base config
    GHSL_DB_BASE_URL: HttpUrl = Field(
        default="https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/"
    )
    TILES_URL: str = "https://ghsl.jrc.ec.europa.eu/download/GHSL_data_{}_shapefile.zip"

    # user config
    base_path: Path = Field(default=global_config.get_path("ghsl", "bronze"))
    coord_system: CoordSystem = CoordSystem.WGS84
    release: str = "R2023A"

    product: Literal[
        "GHS_BUILT_S",
        "GHS_BUILT_H_AGBH",
        "GHS_BUILT_H_ANBH",
        "GHS_BUILT_V",
        "GHS_POP",
        "GHS_SMOD",
    ] = Field(...)
    year: int = 2020
    resolution: int = 100

    def __post_init__(self):
        super().__post_init__()
        self.TILES_URL = self.TILES_URL.format(self.coord_system)
        self._load_tiles()

    def _load_tiles(self):
        """Load GHSL tiles from the tiles shapefile."""
        try:
            self.tiles_gdf = gpd.read_file(self.TILES_URL)
        except Exception as e:
            self.logger.error(f"Failed to download tiles shapefile: {e}")
            raise ValueError(
                f"Could not download GHSL tiles from {self.TILES_URL}"
            ) from e

    @field_validator("year")
    def validate_year(cls, value: int) -> int:
        if value in cls.AVAILABLE_YEARS:
            return value
        raise ValueError(
            f"No datasets found for the provided year: {value}\nAvailable years are: {cls.AVAILABLE_YEARS}"
        )

    @field_validator("resolution")
    def validate_resolution(cls, value: int) -> int:
        if value in cls.AVAILABLE_RESOLUTIONS:
            return value
        raise ValueError(
            f"No datasets found for the provided resolution: {value}\nAvailable resolutions are: {cls.AVAILABLE_RESOLUTIONS}"
        )

    @model_validator(mode="after")
    def validate_configuration(self):
        """
        Validate that the configuration is valid based on dataset availability constraints.

        Specific rules:
        - GHS_BUILT_V, GHS_POP and GHS_SMOD are not available for 2018.
        - 10m resolution is only available for the building height
          (GHS_BUILT_H_*) products and for GHS_BUILT_S.
        - Building height products are only available for 2018.
        - GHS_BUILT_S for 2018 is only available at 10m resolution, in the
          Mollweide projection.
        - GHS_SMOD is only available at 1000m resolution, in the Mollweide
          projection.
        """
        if self.year == 2018 and self.product in ["GHS_BUILT_V", "GHS_POP", "GHS_SMOD"]:
            raise ValueError(f"{self.product} product is not available for 2018")

        # Note: the original check compared against the literal "GHS_BUILT_H",
        # which no product value can equal; 10m data exists for the building
        # height products and for GHS_BUILT_S (2018).
        if self.resolution == 10 and not (
            "GHS_BUILT_H" in self.product or self.product == "GHS_BUILT_S"
        ):
            raise ValueError(
                f"{self.product} product is not available at 10 (10m) resolution"
            )

        if "GHS_BUILT_H" in self.product:
            if self.year != 2018:
                self.logger.warning(
                    "Building height product is only available for 2018, year is set as 2018"
                )
                self.year = 2018

        if self.product == "GHS_BUILT_S":
            if self.year == 2018 and self.resolution != 10:
                self.logger.warning(
                    "Built-up surface product for 2018 is only available at 10m resolution, resolution is set as 10m"
                )
                self.resolution = 10

            if self.resolution == 10 and self.year != 2018:
                self.logger.warning(
                    "Built-up surface product at resolution 10 is only available for 2018, year is set as 2018"
                )
                self.year = 2018

            if self.resolution == 10 and self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Built-up surface product at resolution 10 is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        if self.product == "GHS_SMOD":
            if self.resolution != 1000:
                self.logger.warning(
                    "Settlement model (SMOD) product is only available at 1000 (1km) resolution, resolution is set as 1000"
                )
                self.resolution = 1000

            if self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Settlement model (SMOD) product is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        return self

    @property
    def crs(self) -> str:
        return "EPSG:4326" if self.coord_system == CoordSystem.WGS84 else "ESRI:54009"

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[str]:
        """
        Return the IDs of the tiles intersecting a given geometry or GeoDataFrame.
        """
        return self._get_relevant_tiles(geometry)

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> List[str]:
        """
        Return the IDs of the tiles intersecting a list of points.
        """
        return self._get_relevant_tiles(points)

    def get_data_unit_path(self, unit: str = None, file_ext=".zip", **kwargs) -> Path:
        """Construct and return the path for the configured dataset or dataset tile."""
        info = self._get_product_info()

        tile_path = (
            self.base_path
            / info["product_folder"]
            / (
                f"{info['product_name']}_V{info['product_version']}_0"
                + (f"_{unit}" if unit else "")
                + file_ext
            )
        )

        return tile_path

    def compute_dataset_url(self, tile_id=None) -> str:
        """Compute the download URL for a GHSL dataset."""
        info = self._get_product_info()

        path_segments = [
            str(self.GHSL_DB_BASE_URL),
            info["product_folder"],
            info["product_name"],
            f"V{info['product_version']}-0",
            "tiles" if tile_id else "",
            f"{info['product_name']}_V{info['product_version']}_0"
            + (f"_{tile_id}" if tile_id else "")
            + ".zip",
        ]

        return "/".join(path_segments)

    def _get_relevant_tiles(
        self,
        source: Union[
            BaseGeometry,
            gpd.GeoDataFrame,
            Iterable[Union[Point, tuple]],
        ],
        crs="EPSG:4326",
    ) -> list:
        """
        Identify and return the GHSL tiles that spatially intersect with the given geometry.

        The input geometry can be a Shapely geometry object, a GeoDataFrame,
        or a list of Point objects or (lat, lon) tuples. The method ensures
        the input geometry is in the GHSL tiles' projection before the
        spatial intersection.

        Args:
            source: A Shapely geometry, a GeoDataFrame, or a list of Point
                objects or (lat, lon) tuples representing the area of interest.

        Returns:
            A list of the tile IDs of the intersecting tiles.

        Raises:
            ValueError: If the input `source` is not one of the supported types.
        """
        if isinstance(source, gpd.GeoDataFrame):
            if source.crs != "EPSG:4326":
                source = source.to_crs("EPSG:4326")
            search_geom = source.geometry.unary_union
        elif isinstance(
            source,
            BaseGeometry,
        ):
            search_geom = source
        elif isinstance(source, Iterable) and all(
            # check isinstance first: shapely Points do not support len()
            isinstance(pt, Point) or len(pt) == 2
            for pt in source
        ):
            points = [
                pt if isinstance(pt, Point) else Point(pt[1], pt[0]) for pt in source
            ]
            search_geom = MultiPoint(points)
        else:
            raise ValueError(
                f"Expected Geometry, GeoDataFrame or iterable of Points, got {source.__class__}"
            )

        if self.tiles_gdf.crs != crs:
            search_geom = (
                gpd.GeoDataFrame(geometry=[search_geom], crs=crs)
                .to_crs(self.tiles_gdf.crs)
                .geometry[0]
            )

        # Find intersecting tiles (a list, not a generator, so it can be used
        # as a boolean indexer)
        mask = [
            tile_geom.intersects(search_geom) for tile_geom in self.tiles_gdf.geometry
        ]

        return self.tiles_gdf.loc[mask, "tile_id"].to_list()

    def _get_product_info(self) -> dict:
        """Generate and return common product information used in multiple methods."""
        resolution_str = (
            str(self.resolution)
            if self.coord_system == CoordSystem.Mollweide
            else ("3ss" if self.resolution == 100 else "30ss")
        )
        product_folder = f"{self.product}_GLOBE_{self.release}"
        product_name = f"{self.product}_E{self.year}_GLOBE_{self.release}_{self.coord_system}_{resolution_str}"
        product_version = 2 if self.product == "GHS_SMOD" else 1

        return {
            "resolution_str": resolution_str,
            "product_folder": product_folder,
            "product_name": product_name,
            "product_version": product_version,
        }

    def __repr__(self) -> str:
        """Return a string representation of the GHSL dataset configuration."""
        return (
            f"GHSLDataConfig("
            f"product='{self.product}', "
            f"year={self.year}, "
            f"resolution={self.resolution}, "
            f"coord_system={self.coord_system.name}, "
            f"release='{self.release}'"
            f")"
        )
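
# Illustrative config coercion (a sketch added for this review, not part of
# the original module). Requesting GHS_SMOD with the default resolution and
# CRS shows validate_configuration rewriting incompatible fields:
#
#   config = GHSLDataConfig(product="GHS_SMOD", year=2020, resolution=100)
#   config.resolution    # -> 1000 (SMOD ships only at 1 km)
#   config.coord_system  # -> CoordSystem.Mollweide
#   config.crs           # -> "ESRI:54009"
#
# Note that constructing a config downloads the tile index shapefile, so this
# requires network access.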


class GHSLDataDownloader(BaseHandlerDownloader):
    """A class to handle downloads of GHSL datasets."""

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters.
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses the default logger.
        """
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_id: str,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> Optional[Union[Path, List[Path]]]:
        """
        Downloads and optionally extracts files for a given tile.

        Args:
            tile_id: tile ID to process.
            extract: If True and the downloaded file is a zip, extract its contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods.

        Returns:
            Path to the downloaded file if extract=False,
            List of paths to the extracted files if extract=True,
            None on failure.
        """
        url = self.config.compute_dataset_url(tile_id=tile_id)
        output_path = self.config.get_data_unit_path(tile_id)

        if not extract:
            return self._download_file(url, output_path)

        extracted_files: List[Path] = []

        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
                downloaded_path = self._download_file(url, Path(temp_file.name))
                if not downloaded_path:
                    return None

                with zipfile.ZipFile(str(downloaded_path), "r") as zip_ref:
                    if file_pattern:
                        import re

                        pattern = re.compile(file_pattern)
                        files_to_extract = [
                            f for f in zip_ref.namelist() if pattern.match(f)
                        ]
                    else:
                        files_to_extract = zip_ref.namelist()

                    for file in files_to_extract:
                        extracted_path = output_path.parent / Path(file).name
                        with zip_ref.open(file) as source, open(
                            extracted_path, "wb"
                        ) as target:
                            shutil.copyfileobj(source, target)
                        extracted_files.append(extracted_path)
                        self.logger.info(f"Extracted {file} to {extracted_path}")

                Path(temp_file.name).unlink()
                return extracted_files

        except Exception as e:
            self.logger.error(f"Error downloading/extracting tile {tile_id}: {e}")
            return None
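
    # Illustrative call (a sketch added for this review, not part of the
    # original module; the tile id is hypothetical):
    #
    #   downloader = GHSLDataDownloader({"product": "GHS_POP"})
    #   tif_paths = downloader.download_data_unit("R5_C21")  # extract=True
    #   zip_path = downloader.download_data_unit("R5_C21", extract=False)
    #
    # The first call returns the extracted .tif paths; the second stores the
    # zip at config.get_data_unit_path("R5_C21") and returns that Path.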

    def download_data_units(
        self,
        tile_ids: List[str],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Downloads multiple tiles in parallel, with an option to extract them.

        Args:
            tile_ids: A list of tile IDs to download.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods.

        Returns:
            A list where each element corresponds to a tile ID and contains:
            - Path to the downloaded file if extract=False.
            - List of paths to extracted files if extract=True.
            - None if the download or extraction failed for a tile.
        """
        if not tile_ids:
            self.logger.warning("No tiles to download")
            return []

        with multiprocessing.Pool(processes=self.config.n_workers) as pool:
            download_func = functools.partial(
                self.download_data_unit, extract=extract, file_pattern=file_pattern
            )
            file_paths = list(
                tqdm(
                    pool.imap(download_func, tile_ids),
                    total=len(tile_ids),
                    desc="Downloading data",
                )
            )

        return file_paths

    def download(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # shapely geoms
            gpd.GeoDataFrame,
        ],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specified geographic region.

        The region can be defined by a country code/name, a list of points,
        a Shapely geometry, or a GeoDataFrame. This method identifies the
        relevant GHSL tiles intersecting the region and downloads them in
        parallel.

        Args:
            source: Defines the geographic area for which to download data.
                Can be:
                - A string representing a country code or name.
                - A list of (latitude, longitude) tuples or Shapely Point objects.
                - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                - A GeoDataFrame with geometry column in EPSG:4326.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments. These will be passed down to
                `AdminBoundaries.create()` (if `source` is a country)
                and to `self.download_data_units()`.

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the region or if
            all downloads fail.
        """

        tiles = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(
            tiles, extract=extract, file_pattern=file_pattern, **kwargs
        )

    def download_by_country(
        self,
        country_code: str,
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country_code: The country code (e.g., 'USA', 'GBR') or name.
            data_store: Optional instance of a `DataStore` to be used by
                `AdminBoundaries` for loading country boundaries. If None,
                `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                country boundary. If provided, this boundary is used
                instead of the default from `AdminBoundaries`.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments that are passed to
                `download_data_units`.

        Returns:
            A list of local file paths for the successfully downloaded tiles
            for the specified country.
        """
        return self.download(
            source=country_code,
            data_store=data_store,
            path=country_geom_path,
            extract=extract,
            file_pattern=file_pattern,
            **kwargs,
        )

    def _download_file(self, url: str, output_path: Path) -> Optional[Path]:
        """
        Downloads a file from a URL to a specified output path with a progress bar.

        Args:
            url: The URL to download from.
            output_path: The local path to save the downloaded file.

        Returns:
            The path to the downloaded file on success, None on failure.
        """
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with self.data_store.open(str(output_path), "wb") as file:
                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {output_path.name}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.debug(f"Successfully downloaded: {url} to {output_path}")
            return output_path

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url}: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading {url}: {str(e)}")
            return None
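
# Illustrative end-to-end download (a sketch added for this review, not part
# of the original module; the country code is arbitrary):
#
#   downloader = GHSLDataDownloader({"product": "GHS_BUILT_S", "year": 2020})
#   tif_paths = downloader.download_by_country("KEN")
#   # equivalent to: downloader.download(source="KEN")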


class GHSLDataReader(BaseHandlerReader):

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the reader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters.
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses the default logger.
        """
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> List[TifProcessor]:
        """
        Load TifProcessors from the GHSL dataset.

        Args:
            source_data_path: List of file paths to load.

        Returns:
            List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
        """
        return self._load_raster_data(raster_paths=source_data_path)

    def load(self, source, **kwargs):
        return super().load(source=source, file_ext=".tif")
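
# Illustrative read path (a sketch added for this review, not part of the
# original module): the reader resolves a source to local .tif paths and wraps
# each in a TifProcessor rather than loading pixels eagerly.
#
#   reader = GHSLDataReader({"product": "GHS_POP"})
#   processors = reader.load("KEN")
#   dfs = [tp.to_dataframe() for tp in processors]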


class GHSLDataHandler(BaseHandler):
    """
    Handler for the GHSL (Global Human Settlement Layer) dataset.

    This class provides a unified interface for downloading and loading GHSL data.
    It manages the lifecycle of configuration, downloading, and reading components.
    """

    def __init__(
        self,
        product: Literal[
            "GHS_BUILT_S",
            "GHS_BUILT_H_AGBH",
            "GHS_BUILT_H_ANBH",
            "GHS_BUILT_V",
            "GHS_POP",
            "GHS_SMOD",
        ],
        year: int = 2020,
        resolution: int = 100,
        config: Optional[GHSLDataConfig] = None,
        downloader: Optional[GHSLDataDownloader] = None,
        reader: Optional[GHSLDataReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """
        Initialize the GHSLDataHandler.

        Args:
            product: The GHSL product to use. Must be one of:
                - GHS_BUILT_S: Built-up surface
                - GHS_BUILT_H_AGBH: Average gross building height
                - GHS_BUILT_H_ANBH: Average net building height
                - GHS_BUILT_V: Building volume
                - GHS_POP: Population
                - GHS_SMOD: Settlement model
            year: The year of the data (default: 2020)
            resolution: The resolution in meters (default: 100)
            config: Optional configuration object
            downloader: Optional downloader instance
            reader: Optional reader instance
            data_store: Optional data store instance
            logger: Optional logger instance
            **kwargs: Additional configuration parameters
        """
        self._product = product
        self._year = year
        self._resolution = resolution
        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> GHSLDataConfig:
        """
        Create and return a GHSLDataConfig instance.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured GHSLDataConfig instance
        """
        return GHSLDataConfig(
            product=self._product,
            year=self._year,
            resolution=self._resolution,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataDownloader:
        """
        Create and return a GHSLDataDownloader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured GHSLDataDownloader instance
        """
        return GHSLDataDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataReader:
        """
        Create and return a GHSLDataReader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured GHSLDataReader instance
        """
        return GHSLDataReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ):
        return super().load_data(
            source=source,
            ensure_available=ensure_available,
            file_ext=".tif",
            extract=True,
            file_pattern=r".*\.tif$",
            **kwargs,
        )

    def load_into_dataframe(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Load GHSL data into a pandas DataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            DataFrame containing the GHSL data
        """
        tif_processors = self.load_data(
            source=source, ensure_available=ensure_available, **kwargs
        )
        return pd.concat(
            [tp.to_dataframe() for tp in tif_processors], ignore_index=True
        )