giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/METADATA +3 -1
- giga_spatial-0.6.6.dist-info/RECORD +50 -0
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +29 -4
- gigaspatial/core/io/__init__.py +1 -0
- gigaspatial/core/io/data_api.py +3 -1
- gigaspatial/core/io/database.py +319 -0
- gigaspatial/generators/__init__.py +5 -1
- gigaspatial/generators/poi.py +300 -52
- gigaspatial/generators/zonal/__init__.py +2 -1
- gigaspatial/generators/zonal/admin.py +84 -0
- gigaspatial/generators/zonal/base.py +237 -81
- gigaspatial/generators/zonal/geometry.py +151 -53
- gigaspatial/generators/zonal/mercator.py +50 -19
- gigaspatial/grid/__init__.py +1 -1
- gigaspatial/grid/mercator_tiles.py +33 -10
- gigaspatial/handlers/__init__.py +8 -1
- gigaspatial/handlers/base.py +26 -6
- gigaspatial/handlers/boundaries.py +93 -18
- gigaspatial/handlers/ghsl.py +92 -15
- gigaspatial/handlers/rwi.py +5 -2
- gigaspatial/handlers/worldpop.py +771 -186
- gigaspatial/processing/algorithms.py +188 -0
- gigaspatial/processing/geo.py +204 -102
- gigaspatial/processing/tif_processor.py +220 -45
- giga_spatial-0.6.4.dist-info/RECORD +0 -47
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/top_level.txt +0 -0
gigaspatial/handlers/worldpop.py
CHANGED
@@ -1,59 +1,395 @@
|
|
1
|
-
from pydantic import
|
1
|
+
from pydantic.dataclasses import dataclass
|
2
|
+
from pydantic import (
|
3
|
+
Field,
|
4
|
+
field_validator,
|
5
|
+
model_validator,
|
6
|
+
ConfigDict,
|
7
|
+
)
|
2
8
|
from pathlib import Path
|
9
|
+
import functools
|
10
|
+
import multiprocessing
|
3
11
|
import os
|
4
|
-
from typing import Optional, Union, Literal,
|
12
|
+
from typing import Optional, Union, Literal, List, Dict, Any
|
13
|
+
import numpy as np
|
5
14
|
import pandas as pd
|
15
|
+
import geopandas as gpd
|
6
16
|
import pycountry
|
7
17
|
import requests
|
18
|
+
from shapely.geometry.base import BaseGeometry
|
19
|
+
from shapely.geometry import Point
|
8
20
|
from tqdm import tqdm
|
9
|
-
from urllib.error import URLError
|
10
21
|
import logging
|
11
22
|
|
12
|
-
from gigaspatial.core.io.readers import *
|
13
|
-
from gigaspatial.core.io.writers import *
|
14
23
|
from gigaspatial.core.io.data_store import DataStore
|
15
|
-
from gigaspatial.
|
24
|
+
from gigaspatial.processing.tif_processor import TifProcessor
|
25
|
+
from gigaspatial.handlers.base import (
|
26
|
+
BaseHandlerConfig,
|
27
|
+
BaseHandlerDownloader,
|
28
|
+
BaseHandlerReader,
|
29
|
+
BaseHandler,
|
30
|
+
)
|
16
31
|
from gigaspatial.config import config as global_config
|
17
32
|
|
18
33
|
|
19
|
-
class
|
20
|
-
|
21
|
-
|
34
|
+
class WorldPopRestClient:
|
35
|
+
"""
|
36
|
+
REST API client for WorldPop data access.
|
22
37
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
SCHOOL_AGE_YEAR: int = 2020
|
38
|
+
This class provides direct access to the WorldPop REST API without any
|
39
|
+
configuration dependencies, allowing flexible integration patterns.
|
40
|
+
"""
|
27
41
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
)
|
36
|
-
|
42
|
+
def __init__(
|
43
|
+
self,
|
44
|
+
base_url: str = "https://www.worldpop.org/rest/data",
|
45
|
+
stats_url: str = "https://api.worldpop.org/v1/services/stats",
|
46
|
+
api_key: Optional[str] = None,
|
47
|
+
timeout: int = 30,
|
48
|
+
logger: Optional[logging.Logger] = None,
|
49
|
+
):
|
50
|
+
"""
|
51
|
+
Initialize the WorldPop REST API client.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
base_url: Base URL for the WorldPop REST API
|
55
|
+
stats_url: URL for the WorldPop statistics API
|
56
|
+
api_key: Optional API key for higher rate limits
|
57
|
+
timeout: Request timeout in seconds
|
58
|
+
logger: Optional logger instance
|
59
|
+
"""
|
60
|
+
self.base_url = base_url.rstrip("/")
|
61
|
+
self.stats_url = stats_url.rstrip("/")
|
62
|
+
self.api_key = api_key
|
63
|
+
self.timeout = timeout
|
64
|
+
self.logger = logger or logging.getLogger(self.__class__.__name__)
|
65
|
+
|
66
|
+
# Setup session with default headers
|
67
|
+
self.session = requests.Session()
|
68
|
+
self.session.headers.update(
|
69
|
+
{"Accept": "application/json", "User-Agent": "WorldPop-Python-Client/1.0"}
|
70
|
+
)
|
71
|
+
|
72
|
+
if self.api_key:
|
73
|
+
self.session.headers["X-API-Key"] = self.api_key
|
74
|
+
|
75
|
+
def get_available_projects(self) -> List[Dict[str, Any]]:
|
76
|
+
"""
|
77
|
+
Get list of all available projects (e.g., population, births, pregnancies, etc.).
|
78
|
+
|
79
|
+
Returns:
|
80
|
+
List of project dictionaries with alias, name, title, and description
|
81
|
+
"""
|
82
|
+
try:
|
83
|
+
response = self.session.get(self.base_url, timeout=self.timeout)
|
84
|
+
response.raise_for_status()
|
85
|
+
data = response.json()
|
86
|
+
return data.get("data", [])
|
87
|
+
except requests.RequestException as e:
|
88
|
+
self.logger.error(f"Failed to fetch available project aliases: {e}")
|
89
|
+
return []
|
90
|
+
|
91
|
+
def get_project_sources(self, dataset_type: str) -> List[Dict[str, Any]]:
|
92
|
+
"""
|
93
|
+
Get available sources for a specific project type.
|
94
|
+
|
95
|
+
Args:
|
96
|
+
dataset_type: Project type alias (e.g., 'pop', 'births', 'pregnancies')
|
97
|
+
|
98
|
+
Returns:
|
99
|
+
List of source dictionaries with alias and name
|
100
|
+
"""
|
101
|
+
try:
|
102
|
+
url = f"{self.base_url}/{dataset_type}"
|
103
|
+
response = self.session.get(url, timeout=self.timeout)
|
104
|
+
response.raise_for_status()
|
105
|
+
data = response.json()
|
106
|
+
return data.get("data", [])
|
107
|
+
except requests.RequestException as e:
|
108
|
+
self.logger.error(
|
109
|
+
f"Failed to fetch project sources for {dataset_type}: {e}"
|
110
|
+
)
|
111
|
+
return []
|
112
|
+
|
113
|
+
def get_source_entities(
|
114
|
+
self, dataset_type: str, category: str
|
115
|
+
) -> List[Dict[str, Any]]:
|
116
|
+
"""
|
117
|
+
Get list of entities (countries, global, continental) available for a specific project type and source.
|
118
|
+
|
119
|
+
Args:
|
120
|
+
dataset_type: Project type alias (e.g., 'pop', 'births')
|
121
|
+
category: Source alias (e.g., 'wpgp', 'pic')
|
122
|
+
|
123
|
+
Returns:
|
124
|
+
List of entity dictionaries with id and iso3 codes (if applicable)
|
125
|
+
"""
|
126
|
+
try:
|
127
|
+
url = f"{self.base_url}/{dataset_type}/{category}"
|
128
|
+
response = self.session.get(url, timeout=self.timeout)
|
129
|
+
response.raise_for_status()
|
130
|
+
data = response.json()
|
131
|
+
return data.get("data", [])
|
132
|
+
except requests.RequestException as e:
|
133
|
+
self.logger.error(
|
134
|
+
f"Failed to fetch entities for {dataset_type}/{category}: {e}"
|
135
|
+
)
|
136
|
+
return []
|
137
|
+
|
138
|
+
def get_datasets(self, dataset_type: str, category: str, params: dict):
|
139
|
+
"""
|
140
|
+
Get all datasets available for the params.
|
141
|
+
|
142
|
+
Args:
|
143
|
+
dataset_type: Dataset type alias (e.g., 'pop', 'births')
|
144
|
+
category: Category alias (e.g., 'wpgp', 'pic')
|
145
|
+
params: Query parameters (e.g., {'iso3`:'RWA'})
|
146
|
+
|
147
|
+
Returns:
|
148
|
+
List of dataset dictionaries with metadata and file information
|
149
|
+
"""
|
150
|
+
try:
|
151
|
+
url = f"{self.base_url}/{dataset_type}/{category}"
|
152
|
+
response = self.session.get(url, params=params, timeout=self.timeout)
|
153
|
+
response.raise_for_status()
|
154
|
+
data = response.json()
|
155
|
+
return data.get("data", [])
|
156
|
+
except requests.RequestException as e:
|
157
|
+
self.logger.error(f"Failed to fetch datasets for {params}: {e}")
|
158
|
+
return []
|
159
|
+
|
160
|
+
def get_datasets_by_country(
|
161
|
+
self, dataset_type: str, category: str, iso3: str
|
162
|
+
) -> List[Dict[str, Any]]:
|
163
|
+
"""
|
164
|
+
Get all datasets available for a specific country.
|
165
|
+
|
166
|
+
Args:
|
167
|
+
dataset_type: Dataset type alias (e.g., 'pop', 'births')
|
168
|
+
category: Category alias (e.g., 'wpgp', 'pic')
|
169
|
+
iso3: ISO3 country code (e.g., 'USA', 'BRA')
|
170
|
+
|
171
|
+
Returns:
|
172
|
+
List of dataset dictionaries with metadata and file information
|
173
|
+
"""
|
174
|
+
params = {"iso3": iso3}
|
175
|
+
return self.get_datasets(dataset_type, category, params)
|
176
|
+
|
177
|
+
def get_dataset_by_id(
|
178
|
+
self, dataset_type: str, category: str, dataset_id: str
|
179
|
+
) -> Optional[Dict[str, Any]]:
|
180
|
+
"""
|
181
|
+
Get dataset information by ID.
|
182
|
+
|
183
|
+
Args:
|
184
|
+
dataset_type: Dataset type alias (e.g., 'pop', 'births')
|
185
|
+
category: Category alias (e.g., 'wpgp', 'pic')
|
186
|
+
dataset_id: Dataset ID
|
187
|
+
|
188
|
+
Returns:
|
189
|
+
Dataset dictionary or None if not found
|
190
|
+
"""
|
191
|
+
params = {"id": dataset_id}
|
192
|
+
return self.get_datasets(dataset_type, category, params)
|
193
|
+
|
194
|
+
def find_dataset(
|
195
|
+
self,
|
196
|
+
dataset_type: str,
|
197
|
+
category: str,
|
198
|
+
iso3: str,
|
199
|
+
year: Union[str, int],
|
200
|
+
**filters,
|
201
|
+
) -> Optional[Dict[str, Any]]:
|
202
|
+
"""
|
203
|
+
Find a specific dataset by year and optional filters.
|
204
|
+
|
205
|
+
Args:
|
206
|
+
dataset_type: Dataset type alias
|
207
|
+
category: Category alias
|
208
|
+
iso3: ISO3 country code
|
209
|
+
year: Year to search for
|
210
|
+
**filters: Additional filters (e.g., gender='F', resolution='1km')
|
211
|
+
|
212
|
+
Returns:
|
213
|
+
Dataset dictionary or None if not found
|
214
|
+
"""
|
215
|
+
datasets = self.get_country_datasets(dataset_type, category, iso3)
|
216
|
+
year_str = str(year)
|
217
|
+
|
218
|
+
for dataset in datasets:
|
219
|
+
if dataset.get("popyear") == year_str:
|
220
|
+
# Check additional filters
|
221
|
+
match = True
|
222
|
+
for key, value in filters.items():
|
223
|
+
if key in dataset and dataset[key] != value:
|
224
|
+
match = False
|
225
|
+
break
|
226
|
+
|
227
|
+
if match:
|
228
|
+
return dataset
|
229
|
+
|
230
|
+
return None
|
231
|
+
|
232
|
+
def list_years_for_country(
|
233
|
+
self, dataset_type: str, category: str, iso3: str
|
234
|
+
) -> List[int]:
|
235
|
+
"""
|
236
|
+
List all available years for a specific country and dataset.
|
237
|
+
|
238
|
+
Args:
|
239
|
+
dataset_type: Dataset type alias
|
240
|
+
category: Category alias
|
241
|
+
iso3: ISO3 country code
|
242
|
+
|
243
|
+
Returns:
|
244
|
+
Sorted list of available years
|
245
|
+
"""
|
246
|
+
datasets = self.get_datasets_by_country(dataset_type, category, iso3)
|
247
|
+
years = []
|
248
|
+
|
249
|
+
for dataset in datasets:
|
250
|
+
try:
|
251
|
+
year = int(dataset.get("popyear", 0))
|
252
|
+
if year > 0:
|
253
|
+
years.append(year)
|
254
|
+
except (ValueError, TypeError):
|
255
|
+
continue
|
256
|
+
|
257
|
+
return sorted(years)
|
258
|
+
|
259
|
+
def search_datasets(
|
260
|
+
self,
|
261
|
+
dataset_type: Optional[str] = None,
|
262
|
+
category: Optional[str] = None,
|
263
|
+
iso3: Optional[str] = None,
|
264
|
+
year: Optional[Union[str, int]] = None,
|
265
|
+
**filters,
|
266
|
+
) -> List[Dict[str, Any]]:
|
267
|
+
"""
|
268
|
+
Search for datasets with flexible filtering.
|
269
|
+
|
270
|
+
Args:
|
271
|
+
dataset_type: Optional dataset type filter
|
272
|
+
category: Optional category filter
|
273
|
+
iso3: Optional country filter
|
274
|
+
year: Optional year filter
|
275
|
+
**filters: Additional filters
|
276
|
+
|
277
|
+
Returns:
|
278
|
+
List of matching datasets
|
279
|
+
"""
|
280
|
+
results = []
|
281
|
+
|
282
|
+
if dataset_type:
|
283
|
+
if category:
|
284
|
+
# If we have country-specific filters
|
285
|
+
if iso3:
|
286
|
+
datasets = self.get_datasets_by_country(
|
287
|
+
dataset_type, category, iso3
|
288
|
+
)
|
289
|
+
for dataset in datasets:
|
290
|
+
match = True
|
291
|
+
|
292
|
+
# Check year filter
|
293
|
+
if year and dataset.get("popyear") != str(year):
|
294
|
+
match = False
|
295
|
+
|
296
|
+
# Check additional filters
|
297
|
+
for key, value in filters.items():
|
298
|
+
if key in dataset and dataset[key] != value:
|
299
|
+
match = False
|
300
|
+
break
|
301
|
+
|
302
|
+
if match:
|
303
|
+
results.append(dataset)
|
304
|
+
else:
|
305
|
+
return self.get_source_entities(dataset_type, category)
|
306
|
+
else:
|
307
|
+
return self.get_project_sources(dataset_type)
|
308
|
+
else:
|
309
|
+
return self.get_available_projects()
|
310
|
+
|
311
|
+
return results
|
312
|
+
|
313
|
+
def get_dataset_info(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
|
314
|
+
"""
|
315
|
+
Extract useful information from a dataset dictionary.
|
316
|
+
|
317
|
+
Args:
|
318
|
+
dataset: Dataset dictionary from API
|
319
|
+
|
320
|
+
Returns:
|
321
|
+
Cleaned dataset information
|
322
|
+
"""
|
323
|
+
return {
|
324
|
+
"id": dataset.get("id"),
|
325
|
+
"title": dataset.get("title"),
|
326
|
+
"description": dataset.get("desc"),
|
327
|
+
"doi": dataset.get("doi"),
|
328
|
+
"citation": dataset.get("citation"),
|
329
|
+
"data_format": dataset.get("data_format"),
|
330
|
+
"year": dataset.get("popyear"),
|
331
|
+
"country": dataset.get("country"),
|
332
|
+
"iso3": dataset.get("iso3"),
|
333
|
+
"continent": dataset.get("continent"),
|
334
|
+
"download_urls": dataset.get("files", []),
|
335
|
+
"image_url": dataset.get("url_img"),
|
336
|
+
"summary_url": dataset.get("url_summary"),
|
337
|
+
"license": dataset.get("license"),
|
338
|
+
"organization": dataset.get("organisation"),
|
339
|
+
"author": dataset.get("author_name"),
|
340
|
+
"maintainer": dataset.get("maintainer_name"),
|
341
|
+
"project": dataset.get("project"),
|
342
|
+
"category": dataset.get("category"),
|
343
|
+
"date_created": dataset.get("date"),
|
344
|
+
"public": dataset.get("public") == "Y",
|
345
|
+
"archived": dataset.get("archive") == "Y",
|
346
|
+
}
|
347
|
+
|
348
|
+
def close(self):
|
349
|
+
"""Close the session."""
|
350
|
+
self.session.close()
|
351
|
+
|
352
|
+
def __enter__(self):
|
353
|
+
"""Context manager entry."""
|
354
|
+
return self
|
355
|
+
|
356
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
357
|
+
"""Context manager exit."""
|
358
|
+
self.close()
|
359
|
+
|
360
|
+
|
361
|
+
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
362
|
+
class WPPopulationConfig(BaseHandlerConfig):
|
363
|
+
|
364
|
+
client = WorldPopRestClient()
|
365
|
+
|
366
|
+
AVAILABLE_YEARS: List = Field(default=np.append(np.arange(2000, 2021), 2024))
|
367
|
+
AVAILABLE_RESOLUTIONS: List = Field(default=[100, 1000])
|
37
368
|
|
38
369
|
# user config
|
39
370
|
base_path: Path = Field(default=global_config.get_path("worldpop", "bronze"))
|
40
|
-
|
41
|
-
year: int = Field(
|
42
|
-
resolution:
|
43
|
-
|
44
|
-
|
45
|
-
)
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
371
|
+
project: Literal["pop", "age_structures"] = Field(...)
|
372
|
+
year: int = Field(...)
|
373
|
+
resolution: int = Field(...)
|
374
|
+
un_adjusted: bool = Field(...)
|
375
|
+
constrained: bool = Field(...)
|
376
|
+
school_age: bool = Field(...)
|
377
|
+
|
378
|
+
@field_validator("year")
|
379
|
+
def validate_year(cls, value: str) -> int:
|
380
|
+
if value in cls.AVAILABLE_YEARS:
|
381
|
+
return value
|
382
|
+
raise ValueError(
|
383
|
+
f"No datasets found for the provided year: {value}\nAvailable years are: {cls.AVAILABLE_YEARS}"
|
384
|
+
)
|
385
|
+
|
386
|
+
@field_validator("resolution")
|
387
|
+
def validate_resolution(cls, value: str) -> int:
|
388
|
+
if value in cls.AVAILABLE_RESOLUTIONS:
|
389
|
+
return value
|
390
|
+
raise ValueError(
|
391
|
+
f"No datasets found for the provided resolution: {value}\nAvailable resolutions are: {cls.AVAILABLE_RESOLUTIONS}"
|
392
|
+
)
|
57
393
|
|
58
394
|
@model_validator(mode="after")
|
59
395
|
def validate_configuration(self):
|
@@ -61,145 +397,191 @@ class WorldPopConfig(BaseModel):
|
|
61
397
|
Validate that the configuration is valid based on dataset availability constraints.
|
62
398
|
|
63
399
|
Specific rules:
|
64
|
-
-
|
65
|
-
|
400
|
+
- For age_structures:
|
401
|
+
- School age data is only available for 2020 at 1km resolution.
|
402
|
+
- Non-school age data is only available at 100m resolution.
|
403
|
+
- Unconstrained, non-school age data is only available without UN adjustment.
|
404
|
+
- Constrained, non-school age data with UN adjustment is only available for 2020.
|
405
|
+
- Constrained, non-school age data without UN adjustment is only available for 2020 and 2024.
|
406
|
+
- For pop:
|
407
|
+
- 2024 data is only available at 100m resolution and without UN adjustment.
|
408
|
+
- Constrained data (other than 2024) is only available for 2020 at 100m resolution.
|
409
|
+
- Unconstrained data at 100m or 1km is available for other years, with or without UN adjustment.
|
66
410
|
"""
|
67
|
-
if self.year > self.SCHOOL_AGE_YEAR:
|
68
|
-
if self.resolution != "LOW":
|
69
|
-
raise ValueError(
|
70
|
-
f"Data for year {self.year} is only available at LOW (1km) resolution"
|
71
|
-
)
|
72
|
-
|
73
|
-
if not self.un_adjusted:
|
74
|
-
raise ValueError(
|
75
|
-
f"Data for year {self.year} is only available with UN adjustment"
|
76
|
-
)
|
77
411
|
|
78
|
-
if self.
|
79
|
-
if self.resolution != "LOW":
|
80
|
-
raise ValueError(
|
81
|
-
f"School age data is only available at LOW (1km) resolution"
|
82
|
-
)
|
412
|
+
if self.project == "age_structures":
|
83
413
|
|
84
|
-
if self.
|
85
|
-
self.
|
86
|
-
|
414
|
+
if self.school_age:
|
415
|
+
if self.resolution == 100:
|
416
|
+
self.logger.warning(
|
417
|
+
"School age population datasets are only available at 1km `resolution`, resolution is set as 1000"
|
418
|
+
)
|
419
|
+
self.resolution = 1000
|
87
420
|
|
88
|
-
|
421
|
+
if self.year != 2020:
|
422
|
+
self.logger.warning(
|
423
|
+
"School age population datasets are only available for 2020, `year` is set as 2020"
|
424
|
+
)
|
425
|
+
self.year = 2020
|
89
426
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
return self._dataset_url
|
96
|
-
|
97
|
-
@property
|
98
|
-
def dataset_path(self) -> Path:
|
99
|
-
"""Construct and return the path for the configured dataset."""
|
100
|
-
url_parts = self.dataset_url.split("/")
|
101
|
-
file_path = (
|
102
|
-
"/".join(
|
103
|
-
[url_parts[4], url_parts[5], url_parts[7], self.country, url_parts[-1]]
|
104
|
-
)
|
105
|
-
if self.school_age
|
106
|
-
else "/".join([url_parts[4], url_parts[6], self.country, url_parts[-1]])
|
107
|
-
)
|
108
|
-
return self.base_path / file_path
|
427
|
+
if self.un_adjusted:
|
428
|
+
self.logger.warning(
|
429
|
+
"School age population datasets are only available without UN adjustment, `un_adjusted` is set as False"
|
430
|
+
)
|
431
|
+
self.un_adjusted = False
|
109
432
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
433
|
+
if self.constrained:
|
434
|
+
self.logger.warning(
|
435
|
+
"School age population datasets are only available unconstrained, `constrained` is set as False"
|
436
|
+
)
|
437
|
+
self.constrained = False
|
114
438
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
def _compute_dataset_url(self) -> str:
|
124
|
-
"""Construct and return the URL for the configured dataset."""
|
125
|
-
# handle post-2020 datasets
|
126
|
-
if self.year > self.SCHOOL_AGE_YEAR:
|
127
|
-
return (
|
128
|
-
str(self.WORLDPOP_DB_BASE_URL)
|
129
|
-
+ self.PPP_2021_2022_PATH
|
130
|
-
+ f"{'' if self.constrained else 'un'}constrained/{self.year}/{self.country}/{self.country.lower()}_ppp_{self.year}_1km_UNadj{'_constrained' if self.constrained else ''}.tif"
|
131
|
-
)
|
439
|
+
self.dataset_category = "sapya1km"
|
440
|
+
else:
|
441
|
+
if self.resolution == 1000:
|
442
|
+
self.logger.warning(
|
443
|
+
"Age structures datasets are only available at 100m resolution, `resolution` is set as 100"
|
444
|
+
)
|
445
|
+
self.resolution = 100
|
132
446
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
)
|
447
|
+
if not self.constrained:
|
448
|
+
if self.un_adjusted:
|
449
|
+
self.logger.warning(
|
450
|
+
"Age structures unconstrained datasets are only available without UN adjustment, `un_adjusted` is set as False"
|
451
|
+
)
|
452
|
+
self.un_adjusted = False
|
140
453
|
|
141
|
-
|
142
|
-
|
454
|
+
self.dataset_category = (
|
455
|
+
"G2_UC_Age_2024_100m" if self.year == 2024 else "aswpgp"
|
456
|
+
)
|
457
|
+
else:
|
458
|
+
if self.un_adjusted:
|
459
|
+
if self.year != 2020:
|
460
|
+
self.logger.warning(
|
461
|
+
"Age structures constrained datasets with UN adjustment are only available for 2020, `year` is set as 2020"
|
462
|
+
)
|
463
|
+
self.year = 2020
|
464
|
+
self.dataset_category = "ascicua_2020"
|
465
|
+
else:
|
466
|
+
if self.year == 2024:
|
467
|
+
self.dataset_category = "G2_CN_Age_2024_100m"
|
468
|
+
elif self.year == 2020:
|
469
|
+
self.dataset_category = "ascic_2020"
|
470
|
+
else:
|
471
|
+
raise ValueError(
|
472
|
+
"Age structures constrained datasets without UN adjustment are only available for 2020 and 2024, please set `year` to one of the available options: 2020, 2024"
|
473
|
+
)
|
474
|
+
|
475
|
+
elif self.project == "pop":
|
476
|
+
|
477
|
+
if self.school_age:
|
478
|
+
raise ValueError(
|
479
|
+
f"""
|
480
|
+
Received unexpected value of `{self.school_age}` for project: `{self.project}`.
|
481
|
+
For school age population datasets, please set project as `age_structures`.
|
482
|
+
"""
|
483
|
+
)
|
143
484
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
(wp_metadata.ISO3 == self.country)
|
149
|
-
& (
|
150
|
-
wp_metadata.Covariate
|
151
|
-
== "ppp_"
|
152
|
-
+ str(self.year)
|
153
|
-
+ ("_UNadj" if self.un_adjusted else "")
|
485
|
+
if self.year == 2024:
|
486
|
+
if self.resolution == 1000:
|
487
|
+
self.logger.warning(
|
488
|
+
"2024 datasets are only available at 100m resolution, `resolution` is set as 100m"
|
154
489
|
)
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
490
|
+
self.resolution = 100
|
491
|
+
if self.un_adjusted:
|
492
|
+
self.logger.warning(
|
493
|
+
"2024 datasets are only available without UN adjustment, `un_adjusted` is set as False"
|
494
|
+
)
|
495
|
+
self.un_adjusted = False
|
161
496
|
|
162
|
-
|
163
|
-
|
164
|
-
|
497
|
+
self.dataset_category = (
|
498
|
+
"G2_CN_POP_2024_100m" if self.constrained else "G2_UC_POP_2024_100m"
|
499
|
+
)
|
500
|
+
else:
|
501
|
+
if self.constrained:
|
502
|
+
if self.year != 2020:
|
503
|
+
self.logger.warning(
|
504
|
+
"Population constrained datasets are only available for 2020, `year` is set as 2020"
|
505
|
+
)
|
506
|
+
self.year = 2020
|
507
|
+
|
508
|
+
if self.resolution != 100:
|
509
|
+
self.logger.warning(
|
510
|
+
"Population constrained datasets are only available at 100m resolution, `resolution` is set as 100"
|
511
|
+
)
|
512
|
+
self.resolution = 100
|
513
|
+
|
514
|
+
self.dataset_category = (
|
515
|
+
"cic2020_UNadj_100m" if self.un_adjusted else "cic2020_100m"
|
516
|
+
)
|
517
|
+
else:
|
518
|
+
if self.resolution == 100:
|
519
|
+
self.dataset_category = (
|
520
|
+
f"wpgp{'unadj' if self.un_adjusted else ''}"
|
521
|
+
)
|
522
|
+
else:
|
523
|
+
self.dataset_category = (
|
524
|
+
"wpic1km" if not self.un_adjusted else "wpicuadj1km"
|
525
|
+
)
|
526
|
+
|
527
|
+
def get_relevant_data_units_by_geometry(
|
528
|
+
self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
|
529
|
+
) -> List[Dict[str, Any]]:
|
530
|
+
raise NotImplementedError(
|
531
|
+
"WorldPop does not support geometry-based filtering. "
|
532
|
+
"Please use country-based filtering or direct resource filtering instead."
|
533
|
+
)
|
165
534
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
)
|
173
|
-
dataset_url = "/".join(url_parts)
|
535
|
+
def get_relevant_data_units_by_points(
|
536
|
+
self, points: List[Union[Point, tuple]], **kwargs
|
537
|
+
) -> List[Dict[str, Any]]:
|
538
|
+
raise NotImplementedError(
|
539
|
+
"WorldPop does not support point-based filtering. "
|
540
|
+
"Please use country-based filtering or direct resource filtering instead."
|
541
|
+
)
|
174
542
|
|
175
|
-
|
543
|
+
def get_relevant_data_units_by_country(
|
544
|
+
self, country: str, **kwargs
|
545
|
+
) -> List[Dict[str, Any]]:
|
546
|
+
iso3 = pycountry.countries.lookup(country).alpha_3
|
176
547
|
|
177
|
-
|
548
|
+
datasets = self.client.search_datasets(
|
549
|
+
self.project, self.dataset_category, iso3, self.year
|
550
|
+
)
|
178
551
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
f" resolution={self.resolution}",
|
184
|
-
f" un_adjusted={self.un_adjusted}",
|
185
|
-
f" constrained={self.constrained}",
|
552
|
+
files = [
|
553
|
+
file
|
554
|
+
for file in datasets[0].get("files", [])
|
555
|
+
if ((self.dataset_category == "sapya1km") or file.endswith(".tif"))
|
186
556
|
]
|
187
557
|
|
188
|
-
|
189
|
-
parts.append(f" school_age='{self.school_age}'")
|
190
|
-
parts.append(f" gender='{self.gender}'")
|
558
|
+
return files
|
191
559
|
|
192
|
-
|
560
|
+
def get_data_unit_path(self, unit: str, **kwargs) -> Path:
|
561
|
+
"""
|
562
|
+
Given a WP file url, return the corresponding path.
|
563
|
+
"""
|
564
|
+
return self.base_path / unit.split("GIS/")[1]
|
193
565
|
|
194
|
-
|
566
|
+
def __repr__(self) -> str:
|
195
567
|
|
568
|
+
return (
|
569
|
+
f"WPPopulationConfig(",
|
570
|
+
f"project={self.project}, "
|
571
|
+
f"year={self.year}, "
|
572
|
+
f"resolution={self.resolution}, "
|
573
|
+
f"un_adjusted={self.un_adjusted}, "
|
574
|
+
f"constrained={self.constrained}, "
|
575
|
+
f"school_age={self.school_age}, "
|
576
|
+
f")",
|
577
|
+
)
|
196
578
|
|
197
|
-
|
198
|
-
|
579
|
+
|
580
|
+
class WPPopulationDownloader(BaseHandlerDownloader):
|
199
581
|
|
200
582
|
def __init__(
|
201
583
|
self,
|
202
|
-
config: Union[
|
584
|
+
config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
|
203
585
|
data_store: Optional[DataStore] = None,
|
204
586
|
logger: Optional[logging.Logger] = None,
|
205
587
|
):
|
@@ -207,60 +589,263 @@ class WorldPopDownloader:
|
|
207
589
|
Initialize the downloader.
|
208
590
|
|
209
591
|
Args:
|
210
|
-
config: Configuration for the WorldPop dataset, either as a
|
592
|
+
config: Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters
|
211
593
|
data_store: Optional data storage interface. If not provided, uses LocalDataStore.
|
212
594
|
logger: Optional custom logger. If not provided, uses default logger.
|
213
595
|
"""
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
596
|
+
config = (
|
597
|
+
config
|
598
|
+
if isinstance(config, WPPopulationConfig)
|
599
|
+
else WPPopulationConfig(**config)
|
218
600
|
)
|
601
|
+
super().__init__(config=config, data_store=data_store, logger=logger)
|
219
602
|
|
220
|
-
|
221
|
-
|
222
|
-
"""
|
223
|
-
Create a downloader instance from country and year.
|
224
|
-
|
225
|
-
Args:
|
226
|
-
country: Country code or name
|
227
|
-
year: Year of the dataset
|
228
|
-
**kwargs: Additional parameters for WorldPopConfig or the downloader
|
229
|
-
"""
|
230
|
-
return cls({"country": country, "year": year}, **kwargs)
|
231
|
-
|
232
|
-
def download_dataset(self) -> str:
|
233
|
-
"""
|
234
|
-
Download the configured dataset to the provided output path.
|
235
|
-
"""
|
236
|
-
|
603
|
+
def download_data_unit(self, url, **kwargs):
|
604
|
+
"""Download data file for a url."""
|
237
605
|
try:
|
238
|
-
response =
|
606
|
+
response = self.config.client.session.get(
|
607
|
+
url, stream=True, timeout=self.config.client.timeout
|
608
|
+
)
|
239
609
|
response.raise_for_status()
|
240
610
|
|
241
|
-
output_path = str(self.config.dataset_path)
|
242
|
-
|
243
611
|
total_size = int(response.headers.get("content-length", 0))
|
612
|
+
file_path = self.config.get_data_unit_path(url)
|
244
613
|
|
245
|
-
with self.data_store.open(
|
614
|
+
with self.data_store.open(file_path, "wb") as file:
|
246
615
|
with tqdm(
|
247
616
|
total=total_size,
|
248
617
|
unit="B",
|
249
618
|
unit_scale=True,
|
250
|
-
desc=f"Downloading {os.path.basename(
|
619
|
+
desc=f"Downloading {os.path.basename(file_path)}",
|
251
620
|
) as pbar:
|
252
621
|
for chunk in response.iter_content(chunk_size=8192):
|
253
622
|
if chunk:
|
254
623
|
file.write(chunk)
|
255
624
|
pbar.update(len(chunk))
|
256
625
|
|
257
|
-
self.logger.
|
626
|
+
self.logger.info(f"Successfully downloaded: {file_path}")
|
627
|
+
return file_path
|
258
628
|
|
259
|
-
|
260
|
-
|
261
|
-
except requests.exceptions.RequestException as e:
|
262
|
-
self.logger.error(f"Failed to download dataset {self.config}: {str(e)}")
|
629
|
+
except requests.RequestException as e:
|
630
|
+
self.logger.error(f"Failed to download {url}: {e}")
|
263
631
|
return None
|
264
632
|
except Exception as e:
|
265
|
-
self.logger.error(f"Unexpected error downloading
|
633
|
+
self.logger.error(f"Unexpected error downloading {url}: {e}")
|
266
634
|
return None
|
635
|
+
|
636
|
+
def download_data_units(
|
637
|
+
self,
|
638
|
+
urls: List[str],
|
639
|
+
) -> List[str]:
|
640
|
+
"""Download data files for multiple urls."""
|
641
|
+
|
642
|
+
with multiprocessing.Pool(self.config.n_workers) as pool:
|
643
|
+
download_func = functools.partial(self.download_data_unit)
|
644
|
+
file_paths = list(
|
645
|
+
tqdm(
|
646
|
+
pool.imap(download_func, urls),
|
647
|
+
total=len(urls),
|
648
|
+
desc=f"Downloading data",
|
649
|
+
)
|
650
|
+
)
|
651
|
+
|
652
|
+
return [path for path in file_paths if path is not None]
|
653
|
+
|
654
|
+
def download(self, source: str, **kwargs) -> List[str]:
|
655
|
+
"""Download data for a source"""
|
656
|
+
resources = self.config.get_relevant_data_units(source, **kwargs)
|
657
|
+
return self.download_data_units(resources)
|
658
|
+
|
659
|
+
|
660
|
+
class WPPopulationReader(BaseHandlerReader):
|
661
|
+
|
662
|
+
def __init__(
|
663
|
+
self,
|
664
|
+
config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
|
665
|
+
data_store: Optional[DataStore] = None,
|
666
|
+
logger: Optional[logging.Logger] = None,
|
667
|
+
):
|
668
|
+
"""
|
669
|
+
Initialize the reader.
|
670
|
+
|
671
|
+
Args:
|
672
|
+
config: Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters
|
673
|
+
data_store: Optional data storage interface. If not provided, uses LocalDataStore.
|
674
|
+
logger: Optional custom logger. If not provided, uses default logger.
|
675
|
+
"""
|
676
|
+
config = (
|
677
|
+
config
|
678
|
+
if isinstance(config, WPPopulationConfig)
|
679
|
+
else WPPopulationConfig(**config)
|
680
|
+
)
|
681
|
+
super().__init__(config=config, data_store=data_store, logger=logger)
|
682
|
+
|
683
|
+
def load_from_paths(
|
684
|
+
self, source_data_path: List[Union[str, Path]], **kwargs
|
685
|
+
) -> List[TifProcessor]:
|
686
|
+
"""
|
687
|
+
Load TifProcessors of WP datasets.
|
688
|
+
Args:
|
689
|
+
source_data_path: List of file paths to load
|
690
|
+
Returns:
|
691
|
+
List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
|
692
|
+
"""
|
693
|
+
return self._load_raster_data(raster_paths=source_data_path)
|
694
|
+
|
695
|
+
|
696
|
+
class WPPopulationHandler(BaseHandler):
|
697
|
+
"""
|
698
|
+
Handler for WorldPop Populations datasets.
|
699
|
+
|
700
|
+
This class provides a unified interface for downloading and loading WP Population data.
|
701
|
+
It manages the lifecycle of configuration, downloading, and reading components.
|
702
|
+
"""
|
703
|
+
|
704
|
+
def __init__(
    self,
    project: Literal["pop", "age_structures"] = "pop",
    year: int = 2020,
    resolution: int = 1000,
    un_adjusted: bool = True,
    constrained: bool = False,
    school_age: bool = False,
    config: Optional[WPPopulationConfig] = None,
    downloader: Optional[WPPopulationDownloader] = None,
    reader: Optional[WPPopulationReader] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
    **kwargs,
):
    """
    Initialize the WorldPop population handler.

    Args:
        project: WorldPop project to use ("pop" or "age_structures").
        year: Dataset year.
        resolution: Dataset resolution.
        un_adjusted: Whether to use UN-adjusted estimates.
        constrained: Whether to use the constrained dataset variant.
        school_age: Whether to use school-age population datasets.
        config: Optional pre-built configuration object.
        downloader: Optional pre-built downloader component.
        reader: Optional pre-built reader component.
        data_store: Optional data storage interface.
        logger: Optional custom logger.
        **kwargs: Extra keyword arguments.
            NOTE(review): currently accepted but not forwarded to the
            base class — confirm whether that is intentional.
    """
    # These attributes must be set before delegating to the base
    # initializer: create_config() reads them from self.
    self._project, self._year = project, year
    self._resolution, self._un_adjusted = resolution, un_adjusted
    self._constrained, self._school_age = constrained, school_age
    super().__init__(
        config=config,
        downloader=downloader,
        reader=reader,
        data_store=data_store,
        logger=logger,
    )
|
732
|
+
|
733
|
+
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> WPPopulationConfig:
    """
    Build the WPPopulationConfig for this handler.

    Args:
        data_store: Data store the configuration should operate on.
        logger: Logger the configuration should use.
        **kwargs: Additional configuration parameters.

    Returns:
        Configured WPPopulationConfig instance.
    """
    # Dataset parameters captured at handler construction time are
    # combined with the runtime data_store/logger and any extras.
    return WPPopulationConfig(
        data_store=data_store,
        logger=logger,
        project=self._project,
        year=self._year,
        resolution=self._resolution,
        un_adjusted=self._un_adjusted,
        constrained=self._constrained,
        school_age=self._school_age,
        **kwargs,
    )
|
758
|
+
|
759
|
+
def create_downloader(
    self,
    config: WPPopulationConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> WPPopulationDownloader:
    """
    Build the downloader component for this handler.

    Args:
        config: The configuration object.
        data_store: Data store the downloader should operate on.
        logger: Logger the downloader should use.
        **kwargs: Additional downloader parameters.

    Returns:
        Configured WPPopulationDownloader instance.
    """
    downloader = WPPopulationDownloader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
    return downloader
|
781
|
+
|
782
|
+
def create_reader(
    self,
    config: WPPopulationConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> WPPopulationReader:
    """
    Build the reader component for this handler.

    Args:
        config: The configuration object.
        data_store: Data store the reader should operate on.
        logger: Logger the reader should use.
        **kwargs: Additional reader parameters.

    Returns:
        Configured WPPopulationReader instance.
    """
    reader = WPPopulationReader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
    return reader
|
804
|
+
|
805
|
+
def load_into_dataframe(
    self,
    source: str,
    ensure_available: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Load WorldPop population data into a pandas DataFrame.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        DataFrame containing the WorldPop population data; empty
        DataFrame when no rasters match the source.
    """
    tif_processors = self.load_data(
        source=source, ensure_available=ensure_available, **kwargs
    )
    # pd.concat raises ValueError on an empty sequence; return an empty
    # frame instead so callers can handle "no data" uniformly.
    if not tif_processors:
        return pd.DataFrame()
    return pd.concat(
        [tp.to_dataframe() for tp in tif_processors], ignore_index=True
    )
|
828
|
+
|
829
|
+
def load_into_geodataframe(
    self,
    source: str,
    ensure_available: bool = True,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Load WorldPop population data into a geopandas GeoDataFrame.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        GeoDataFrame containing the WorldPop population data; empty
        GeoDataFrame when no rasters match the source.
    """
    tif_processors = self.load_data(
        source=source, ensure_available=ensure_available, **kwargs
    )
    # pd.concat raises ValueError on an empty sequence; return an empty
    # GeoDataFrame instead so callers can handle "no data" uniformly.
    if not tif_processors:
        return gpd.GeoDataFrame()
    return pd.concat(
        [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
    )
|