giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,59 +1,395 @@
1
- from pydantic import BaseModel, Field, HttpUrl, field_validator, model_validator
1
+ from pydantic.dataclasses import dataclass
2
+ from pydantic import (
3
+ Field,
4
+ field_validator,
5
+ model_validator,
6
+ ConfigDict,
7
+ )
2
8
  from pathlib import Path
9
+ import functools
10
+ import multiprocessing
3
11
  import os
4
- from typing import Optional, Union, Literal, ClassVar
12
+ from typing import Optional, Union, Literal, List, Dict, Any
13
+ import numpy as np
5
14
  import pandas as pd
15
+ import geopandas as gpd
6
16
  import pycountry
7
17
  import requests
18
+ from shapely.geometry.base import BaseGeometry
19
+ from shapely.geometry import Point
8
20
  from tqdm import tqdm
9
- from urllib.error import URLError
10
21
  import logging
11
22
 
12
- from gigaspatial.core.io.readers import *
13
- from gigaspatial.core.io.writers import *
14
23
  from gigaspatial.core.io.data_store import DataStore
15
- from gigaspatial.core.io.local_data_store import LocalDataStore
24
+ from gigaspatial.processing.tif_processor import TifProcessor
25
+ from gigaspatial.handlers.base import (
26
+ BaseHandlerConfig,
27
+ BaseHandlerDownloader,
28
+ BaseHandlerReader,
29
+ BaseHandler,
30
+ )
16
31
  from gigaspatial.config import config as global_config
17
32
 
18
33
 
19
- class WorldPopConfig(BaseModel):
20
- # class variables
21
- _metadata_cache: ClassVar[Optional[pd.DataFrame]] = None
34
+ class WorldPopRestClient:
35
+ """
36
+ REST API client for WorldPop data access.
22
37
 
23
- # constants
24
- CURRENT_MAX_YEAR: int = 2022
25
- EARLIEST_YEAR: int = 2000
26
- SCHOOL_AGE_YEAR: int = 2020
38
+ This class provides direct access to the WorldPop REST API without any
39
+ configuration dependencies, allowing flexible integration patterns.
40
+ """
27
41
 
28
- # base config
29
- WORLDPOP_DB_BASE_URL: HttpUrl = Field(default="https://data.worldpop.org/")
30
- SCHOOL_AGE_POPULATION_PATH: str = Field(
31
- default="GIS/AgeSex_structures/school_age_population/v1/2020/"
32
- )
33
- PPP_2021_2022_PATH: str = Field(
34
- default="GIS/Population/Global_2021_2022_1km_UNadj/"
35
- )
36
- DATASETS_METADATA_PATH: str = Field(default="assets/wpgpDatasets.csv")
42
+ def __init__(
43
+ self,
44
+ base_url: str = "https://www.worldpop.org/rest/data",
45
+ stats_url: str = "https://api.worldpop.org/v1/services/stats",
46
+ api_key: Optional[str] = None,
47
+ timeout: int = 30,
48
+ logger: Optional[logging.Logger] = None,
49
+ ):
50
+ """
51
+ Initialize the WorldPop REST API client.
52
+
53
+ Args:
54
+ base_url: Base URL for the WorldPop REST API
55
+ stats_url: URL for the WorldPop statistics API
56
+ api_key: Optional API key for higher rate limits
57
+ timeout: Request timeout in seconds
58
+ logger: Optional logger instance
59
+ """
60
+ self.base_url = base_url.rstrip("/")
61
+ self.stats_url = stats_url.rstrip("/")
62
+ self.api_key = api_key
63
+ self.timeout = timeout
64
+ self.logger = logger or logging.getLogger(self.__class__.__name__)
65
+
66
+ # Setup session with default headers
67
+ self.session = requests.Session()
68
+ self.session.headers.update(
69
+ {"Accept": "application/json", "User-Agent": "WorldPop-Python-Client/1.0"}
70
+ )
71
+
72
+ if self.api_key:
73
+ self.session.headers["X-API-Key"] = self.api_key
74
+
75
+ def get_available_projects(self) -> List[Dict[str, Any]]:
76
+ """
77
+ Get list of all available projects (e.g., population, births, pregnancies, etc.).
78
+
79
+ Returns:
80
+ List of project dictionaries with alias, name, title, and description
81
+ """
82
+ try:
83
+ response = self.session.get(self.base_url, timeout=self.timeout)
84
+ response.raise_for_status()
85
+ data = response.json()
86
+ return data.get("data", [])
87
+ except requests.RequestException as e:
88
+ self.logger.error(f"Failed to fetch available project aliases: {e}")
89
+ return []
90
+
91
+ def get_project_sources(self, dataset_type: str) -> List[Dict[str, Any]]:
92
+ """
93
+ Get available sources for a specific project type.
94
+
95
+ Args:
96
+ dataset_type: Project type alias (e.g., 'pop', 'births', 'pregnancies')
97
+
98
+ Returns:
99
+ List of source dictionaries with alias and name
100
+ """
101
+ try:
102
+ url = f"{self.base_url}/{dataset_type}"
103
+ response = self.session.get(url, timeout=self.timeout)
104
+ response.raise_for_status()
105
+ data = response.json()
106
+ return data.get("data", [])
107
+ except requests.RequestException as e:
108
+ self.logger.error(
109
+ f"Failed to fetch project sources for {dataset_type}: {e}"
110
+ )
111
+ return []
112
+
113
+ def get_source_entities(
114
+ self, dataset_type: str, category: str
115
+ ) -> List[Dict[str, Any]]:
116
+ """
117
+ Get list of entities (countries, global, continental) available for a specific project type and source.
118
+
119
+ Args:
120
+ dataset_type: Project type alias (e.g., 'pop', 'births')
121
+ category: Source alias (e.g., 'wpgp', 'pic')
122
+
123
+ Returns:
124
+ List of entity dictionaries with id and iso3 codes (if applicable)
125
+ """
126
+ try:
127
+ url = f"{self.base_url}/{dataset_type}/{category}"
128
+ response = self.session.get(url, timeout=self.timeout)
129
+ response.raise_for_status()
130
+ data = response.json()
131
+ return data.get("data", [])
132
+ except requests.RequestException as e:
133
+ self.logger.error(
134
+ f"Failed to fetch entities for {dataset_type}/{category}: {e}"
135
+ )
136
+ return []
137
+
138
+ def get_datasets(self, dataset_type: str, category: str, params: dict):
139
+ """
140
+ Get all datasets available for the params.
141
+
142
+ Args:
143
+ dataset_type: Dataset type alias (e.g., 'pop', 'births')
144
+ category: Category alias (e.g., 'wpgp', 'pic')
145
+ params: Query parameters (e.g., {'iso3': 'RWA'})
146
+
147
+ Returns:
148
+ List of dataset dictionaries with metadata and file information
149
+ """
150
+ try:
151
+ url = f"{self.base_url}/{dataset_type}/{category}"
152
+ response = self.session.get(url, params=params, timeout=self.timeout)
153
+ response.raise_for_status()
154
+ data = response.json()
155
+ return data.get("data", [])
156
+ except requests.RequestException as e:
157
+ self.logger.error(f"Failed to fetch datasets for {params}: {e}")
158
+ return []
159
+
160
+ def get_datasets_by_country(
161
+ self, dataset_type: str, category: str, iso3: str
162
+ ) -> List[Dict[str, Any]]:
163
+ """
164
+ Get all datasets available for a specific country.
165
+
166
+ Args:
167
+ dataset_type: Dataset type alias (e.g., 'pop', 'births')
168
+ category: Category alias (e.g., 'wpgp', 'pic')
169
+ iso3: ISO3 country code (e.g., 'USA', 'BRA')
170
+
171
+ Returns:
172
+ List of dataset dictionaries with metadata and file information
173
+ """
174
+ params = {"iso3": iso3}
175
+ return self.get_datasets(dataset_type, category, params)
176
+
177
+ def get_dataset_by_id(
178
+ self, dataset_type: str, category: str, dataset_id: str
179
+ ) -> Optional[Dict[str, Any]]:
180
+ """
181
+ Get dataset information by ID.
182
+
183
+ Args:
184
+ dataset_type: Dataset type alias (e.g., 'pop', 'births')
185
+ category: Category alias (e.g., 'wpgp', 'pic')
186
+ dataset_id: Dataset ID
187
+
188
+ Returns:
189
+ Dataset dictionary or None if not found
190
+ """
191
+ params = {"id": dataset_id}
192
+ return self.get_datasets(dataset_type, category, params)
193
+
194
+ def find_dataset(
195
+ self,
196
+ dataset_type: str,
197
+ category: str,
198
+ iso3: str,
199
+ year: Union[str, int],
200
+ **filters,
201
+ ) -> Optional[Dict[str, Any]]:
202
+ """
203
+ Find a specific dataset by year and optional filters.
204
+
205
+ Args:
206
+ dataset_type: Dataset type alias
207
+ category: Category alias
208
+ iso3: ISO3 country code
209
+ year: Year to search for
210
+ **filters: Additional filters (e.g., gender='F', resolution='1km')
211
+
212
+ Returns:
213
+ Dataset dictionary or None if not found
214
+ """
215
+ datasets = self.get_datasets_by_country(dataset_type, category, iso3)
216
+ year_str = str(year)
217
+
218
+ for dataset in datasets:
219
+ if dataset.get("popyear") == year_str:
220
+ # Check additional filters
221
+ match = True
222
+ for key, value in filters.items():
223
+ if key in dataset and dataset[key] != value:
224
+ match = False
225
+ break
226
+
227
+ if match:
228
+ return dataset
229
+
230
+ return None
231
+
232
+ def list_years_for_country(
233
+ self, dataset_type: str, category: str, iso3: str
234
+ ) -> List[int]:
235
+ """
236
+ List all available years for a specific country and dataset.
237
+
238
+ Args:
239
+ dataset_type: Dataset type alias
240
+ category: Category alias
241
+ iso3: ISO3 country code
242
+
243
+ Returns:
244
+ Sorted list of available years
245
+ """
246
+ datasets = self.get_datasets_by_country(dataset_type, category, iso3)
247
+ years = []
248
+
249
+ for dataset in datasets:
250
+ try:
251
+ year = int(dataset.get("popyear", 0))
252
+ if year > 0:
253
+ years.append(year)
254
+ except (ValueError, TypeError):
255
+ continue
256
+
257
+ return sorted(years)
258
+
259
+ def search_datasets(
260
+ self,
261
+ dataset_type: Optional[str] = None,
262
+ category: Optional[str] = None,
263
+ iso3: Optional[str] = None,
264
+ year: Optional[Union[str, int]] = None,
265
+ **filters,
266
+ ) -> List[Dict[str, Any]]:
267
+ """
268
+ Search for datasets with flexible filtering.
269
+
270
+ Args:
271
+ dataset_type: Optional dataset type filter
272
+ category: Optional category filter
273
+ iso3: Optional country filter
274
+ year: Optional year filter
275
+ **filters: Additional filters
276
+
277
+ Returns:
278
+ List of matching datasets
279
+ """
280
+ results = []
281
+
282
+ if dataset_type:
283
+ if category:
284
+ # If we have country-specific filters
285
+ if iso3:
286
+ datasets = self.get_datasets_by_country(
287
+ dataset_type, category, iso3
288
+ )
289
+ for dataset in datasets:
290
+ match = True
291
+
292
+ # Check year filter
293
+ if year and dataset.get("popyear") != str(year):
294
+ match = False
295
+
296
+ # Check additional filters
297
+ for key, value in filters.items():
298
+ if key in dataset and dataset[key] != value:
299
+ match = False
300
+ break
301
+
302
+ if match:
303
+ results.append(dataset)
304
+ else:
305
+ return self.get_source_entities(dataset_type, category)
306
+ else:
307
+ return self.get_project_sources(dataset_type)
308
+ else:
309
+ return self.get_available_projects()
310
+
311
+ return results
312
+
313
+ def get_dataset_info(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
314
+ """
315
+ Extract useful information from a dataset dictionary.
316
+
317
+ Args:
318
+ dataset: Dataset dictionary from API
319
+
320
+ Returns:
321
+ Cleaned dataset information
322
+ """
323
+ return {
324
+ "id": dataset.get("id"),
325
+ "title": dataset.get("title"),
326
+ "description": dataset.get("desc"),
327
+ "doi": dataset.get("doi"),
328
+ "citation": dataset.get("citation"),
329
+ "data_format": dataset.get("data_format"),
330
+ "year": dataset.get("popyear"),
331
+ "country": dataset.get("country"),
332
+ "iso3": dataset.get("iso3"),
333
+ "continent": dataset.get("continent"),
334
+ "download_urls": dataset.get("files", []),
335
+ "image_url": dataset.get("url_img"),
336
+ "summary_url": dataset.get("url_summary"),
337
+ "license": dataset.get("license"),
338
+ "organization": dataset.get("organisation"),
339
+ "author": dataset.get("author_name"),
340
+ "maintainer": dataset.get("maintainer_name"),
341
+ "project": dataset.get("project"),
342
+ "category": dataset.get("category"),
343
+ "date_created": dataset.get("date"),
344
+ "public": dataset.get("public") == "Y",
345
+ "archived": dataset.get("archive") == "Y",
346
+ }
347
+
348
+ def close(self):
349
+ """Close the session."""
350
+ self.session.close()
351
+
352
+ def __enter__(self):
353
+ """Context manager entry."""
354
+ return self
355
+
356
+ def __exit__(self, exc_type, exc_val, exc_tb):
357
+ """Context manager exit."""
358
+ self.close()
359
+
360
+
361
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
362
+ class WPPopulationConfig(BaseHandlerConfig):
363
+
364
+ client = WorldPopRestClient()
365
+
366
+ AVAILABLE_YEARS: List = Field(default=np.append(np.arange(2000, 2021), 2024))
367
+ AVAILABLE_RESOLUTIONS: List = Field(default=[100, 1000])
37
368
 
38
369
  # user config
39
370
  base_path: Path = Field(default=global_config.get_path("worldpop", "bronze"))
40
- country: str = Field(...)
41
- year: int = Field(..., ge=EARLIEST_YEAR, le=CURRENT_MAX_YEAR)
42
- resolution: Literal["HIGH", "LOW"] = Field(
43
- default="LOW",
44
- description="Spatial resolution of the population grid: HIGH (100m) or LOW (1km)",
45
- )
46
- un_adjusted: bool = True
47
- constrained: bool = False
48
- school_age: Optional[Literal["PRIMARY", "SECONDARY"]] = None
49
- gender: Literal["F", "M", "F_M"] = "F_M"
50
-
51
- @field_validator("country")
52
- def validate_country(cls, value: str) -> str:
53
- try:
54
- return pycountry.countries.lookup(value).alpha_3
55
- except LookupError:
56
- raise ValueError(f"Invalid country code provided: {value}")
371
+ project: Literal["pop", "age_structures"] = Field(...)
372
+ year: int = Field(...)
373
+ resolution: int = Field(...)
374
+ un_adjusted: bool = Field(...)
375
+ constrained: bool = Field(...)
376
+ school_age: bool = Field(...)
377
+
378
+ @field_validator("year")
379
+ def validate_year(cls, value: str) -> int:
380
+ if value in cls.AVAILABLE_YEARS:
381
+ return value
382
+ raise ValueError(
383
+ f"No datasets found for the provided year: {value}\nAvailable years are: {cls.AVAILABLE_YEARS}"
384
+ )
385
+
386
+ @field_validator("resolution")
387
+ def validate_resolution(cls, value: str) -> int:
388
+ if value in cls.AVAILABLE_RESOLUTIONS:
389
+ return value
390
+ raise ValueError(
391
+ f"No datasets found for the provided resolution: {value}\nAvailable resolutions are: {cls.AVAILABLE_RESOLUTIONS}"
392
+ )
57
393
 
58
394
  @model_validator(mode="after")
59
395
  def validate_configuration(self):
@@ -61,145 +397,191 @@ class WorldPopConfig(BaseModel):
61
397
  Validate that the configuration is valid based on dataset availability constraints.
62
398
 
63
399
  Specific rules:
64
- - Post-2020 data is only available at 1km resolution with UN adjustment
65
- - School age population data is only available for 2020 at 1km resolution
400
+ - For age_structures:
401
+ - School age data is only available for 2020 at 1km resolution.
402
+ - Non-school age data is only available at 100m resolution.
403
+ - Unconstrained, non-school age data is only available without UN adjustment.
404
+ - Constrained, non-school age data with UN adjustment is only available for 2020.
405
+ - Constrained, non-school age data without UN adjustment is only available for 2020 and 2024.
406
+ - For pop:
407
+ - 2024 data is only available at 100m resolution and without UN adjustment.
408
+ - Constrained data (other than 2024) is only available for 2020 at 100m resolution.
409
+ - Unconstrained data at 100m or 1km is available for other years, with or without UN adjustment.
66
410
  """
67
- if self.year > self.SCHOOL_AGE_YEAR:
68
- if self.resolution != "LOW":
69
- raise ValueError(
70
- f"Data for year {self.year} is only available at LOW (1km) resolution"
71
- )
72
-
73
- if not self.un_adjusted:
74
- raise ValueError(
75
- f"Data for year {self.year} is only available with UN adjustment"
76
- )
77
411
 
78
- if self.school_age:
79
- if self.resolution != "LOW":
80
- raise ValueError(
81
- f"School age data is only available at LOW (1km) resolution"
82
- )
412
+ if self.project == "age_structures":
83
413
 
84
- if self.year != self.SCHOOL_AGE_YEAR:
85
- self.year = self.SCHOOL_AGE_YEAR
86
- raise ValueError(f"School age data is only available for 2020")
414
+ if self.school_age:
415
+ if self.resolution == 100:
416
+ self.logger.warning(
417
+ "School age population datasets are only available at 1km `resolution`, resolution is set as 1000"
418
+ )
419
+ self.resolution = 1000
87
420
 
88
- return self
421
+ if self.year != 2020:
422
+ self.logger.warning(
423
+ "School age population datasets are only available for 2020, `year` is set as 2020"
424
+ )
425
+ self.year = 2020
89
426
 
90
- @property
91
- def dataset_url(self) -> str:
92
- """Get the URL for the configured dataset. The URL is computed on first access and then cached for subsequent calls."""
93
- if not hasattr(self, "_dataset_url"):
94
- self._dataset_url = self._compute_dataset_url()
95
- return self._dataset_url
96
-
97
- @property
98
- def dataset_path(self) -> Path:
99
- """Construct and return the path for the configured dataset."""
100
- url_parts = self.dataset_url.split("/")
101
- file_path = (
102
- "/".join(
103
- [url_parts[4], url_parts[5], url_parts[7], self.country, url_parts[-1]]
104
- )
105
- if self.school_age
106
- else "/".join([url_parts[4], url_parts[6], self.country, url_parts[-1]])
107
- )
108
- return self.base_path / file_path
427
+ if self.un_adjusted:
428
+ self.logger.warning(
429
+ "School age population datasets are only available without UN adjustment, `un_adjusted` is set as False"
430
+ )
431
+ self.un_adjusted = False
109
432
 
110
- def _load_datasets_metadata(self) -> pd.DataFrame:
111
- """Load and return the WorldPop datasets metadata, using cache if available."""
112
- if WorldPopConfig._metadata_cache is not None:
113
- return WorldPopConfig._metadata_cache
433
+ if self.constrained:
434
+ self.logger.warning(
435
+ "School age population datasets are only available unconstrained, `constrained` is set as False"
436
+ )
437
+ self.constrained = False
114
438
 
115
- try:
116
- WorldPopConfig._metadata_cache = pd.read_csv(
117
- str(self.WORLDPOP_DB_BASE_URL) + self.DATASETS_METADATA_PATH
118
- )
119
- return WorldPopConfig._metadata_cache
120
- except (URLError, pd.errors.EmptyDataError) as e:
121
- raise RuntimeError(f"Failed to load WorldPop datasets metadata: {e}")
122
-
123
- def _compute_dataset_url(self) -> str:
124
- """Construct and return the URL for the configured dataset."""
125
- # handle post-2020 datasets
126
- if self.year > self.SCHOOL_AGE_YEAR:
127
- return (
128
- str(self.WORLDPOP_DB_BASE_URL)
129
- + self.PPP_2021_2022_PATH
130
- + f"{'' if self.constrained else 'un'}constrained/{self.year}/{self.country}/{self.country.lower()}_ppp_{self.year}_1km_UNadj{'_constrained' if self.constrained else ''}.tif"
131
- )
439
+ self.dataset_category = "sapya1km"
440
+ else:
441
+ if self.resolution == 1000:
442
+ self.logger.warning(
443
+ "Age structures datasets are only available at 100m resolution, `resolution` is set as 100"
444
+ )
445
+ self.resolution = 100
132
446
 
133
- # handle school-age population datasets
134
- if self.school_age:
135
- return (
136
- str(self.WORLDPOP_DB_BASE_URL)
137
- + self.SCHOOL_AGE_POPULATION_PATH
138
- + f"{self.country}/{self.country}_SAP_1km_2020/{self.country}_{self.gender}_{self.school_age}_2020_1km.tif"
139
- )
447
+ if not self.constrained:
448
+ if self.un_adjusted:
449
+ self.logger.warning(
450
+ "Age structures unconstrained datasets are only available without UN adjustment, `un_adjusted` is set as False"
451
+ )
452
+ self.un_adjusted = False
140
453
 
141
- # handle standard population datasets
142
- wp_metadata = self._load_datasets_metadata()
454
+ self.dataset_category = (
455
+ "G2_UC_Age_2024_100m" if self.year == 2024 else "aswpgp"
456
+ )
457
+ else:
458
+ if self.un_adjusted:
459
+ if self.year != 2020:
460
+ self.logger.warning(
461
+ "Age structures constrained datasets with UN adjustment are only available for 2020, `year` is set as 2020"
462
+ )
463
+ self.year = 2020
464
+ self.dataset_category = "ascicua_2020"
465
+ else:
466
+ if self.year == 2024:
467
+ self.dataset_category = "G2_CN_Age_2024_100m"
468
+ elif self.year == 2020:
469
+ self.dataset_category = "ascic_2020"
470
+ else:
471
+ raise ValueError(
472
+ "Age structures constrained datasets without UN adjustment are only available for 2020 and 2024, please set `year` to one of the available options: 2020, 2024"
473
+ )
474
+
475
+ elif self.project == "pop":
476
+
477
+ if self.school_age:
478
+ raise ValueError(
479
+ f"""
480
+ Received unexpected value of `{self.school_age}` for project: `{self.project}`.
481
+ For school age population datasets, please set project as `age_structures`.
482
+ """
483
+ )
143
484
 
144
- try:
145
- dataset_url = (
146
- self.WORLDPOP_DB_BASE_URL
147
- + wp_metadata[
148
- (wp_metadata.ISO3 == self.country)
149
- & (
150
- wp_metadata.Covariate
151
- == "ppp_"
152
- + str(self.year)
153
- + ("_UNadj" if self.un_adjusted else "")
485
+ if self.year == 2024:
486
+ if self.resolution == 1000:
487
+ self.logger.warning(
488
+ "2024 datasets are only available at 100m resolution, `resolution` is set as 100m"
154
489
  )
155
- ].PathToRaster.values[0]
156
- )
157
- except IndexError:
158
- raise ValueError(
159
- f"No dataset found for country={self.country}, year={self.year}, un_adjusted={self.un_adjusted}"
160
- )
490
+ self.resolution = 100
491
+ if self.un_adjusted:
492
+ self.logger.warning(
493
+ "2024 datasets are only available without UN adjustment, `un_adjusted` is set as False"
494
+ )
495
+ self.un_adjusted = False
161
496
 
162
- # handle resolution conversion if needed
163
- if self.resolution == "HIGH":
164
- return dataset_url
497
+ self.dataset_category = (
498
+ "G2_CN_POP_2024_100m" if self.constrained else "G2_UC_POP_2024_100m"
499
+ )
500
+ else:
501
+ if self.constrained:
502
+ if self.year != 2020:
503
+ self.logger.warning(
504
+ "Population constrained datasets are only available for 2020, `year` is set as 2020"
505
+ )
506
+ self.year = 2020
507
+
508
+ if self.resolution != 100:
509
+ self.logger.warning(
510
+ "Population constrained datasets are only available at 100m resolution, `resolution` is set as 100"
511
+ )
512
+ self.resolution = 100
513
+
514
+ self.dataset_category = (
515
+ "cic2020_UNadj_100m" if self.un_adjusted else "cic2020_100m"
516
+ )
517
+ else:
518
+ if self.resolution == 100:
519
+ self.dataset_category = (
520
+ f"wpgp{'unadj' if self.un_adjusted else ''}"
521
+ )
522
+ else:
523
+ self.dataset_category = (
524
+ "wpic1km" if not self.un_adjusted else "wpicuadj1km"
525
+ )
526
+
527
+ def get_relevant_data_units_by_geometry(
528
+ self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
529
+ ) -> List[Dict[str, Any]]:
530
+ raise NotImplementedError(
531
+ "WorldPop does not support geometry-based filtering. "
532
+ "Please use country-based filtering or direct resource filtering instead."
533
+ )
165
534
 
166
- url_parts = dataset_url.split("/")
167
- url_parts[5] = (
168
- url_parts[5] + "_1km" + ("_UNadj" if self.un_adjusted else "")
169
- ) # get 1km folder with UNadj specification
170
- url_parts[8] = url_parts[8].replace(
171
- str(self.year), str(self.year) + "_1km_Aggregated"
172
- ) # get filename with 1km res
173
- dataset_url = "/".join(url_parts)
535
+ def get_relevant_data_units_by_points(
536
+ self, points: List[Union[Point, tuple]], **kwargs
537
+ ) -> List[Dict[str, Any]]:
538
+ raise NotImplementedError(
539
+ "WorldPop does not support point-based filtering. "
540
+ "Please use country-based filtering or direct resource filtering instead."
541
+ )
174
542
 
175
- return dataset_url
543
+ def get_relevant_data_units_by_country(
544
+ self, country: str, **kwargs
545
+ ) -> List[Dict[str, Any]]:
546
+ iso3 = pycountry.countries.lookup(country).alpha_3
176
547
 
177
- def __repr__(self) -> str:
548
+ datasets = self.client.search_datasets(
549
+ self.project, self.dataset_category, iso3, self.year
550
+ )
178
551
 
179
- parts = [
180
- f"WorldpopConfig(",
181
- f" country='{self.country}'",
182
- f" year={self.year}",
183
- f" resolution={self.resolution}",
184
- f" un_adjusted={self.un_adjusted}",
185
- f" constrained={self.constrained}",
552
+ files = [
553
+ file
554
+ for file in datasets[0].get("files", [])
555
+ if ((self.dataset_category == "sapya1km") or file.endswith(".tif"))
186
556
  ]
187
557
 
188
- if self.school_age:
189
- parts.append(f" school_age='{self.school_age}'")
190
- parts.append(f" gender='{self.gender}'")
558
+ return files
191
559
 
192
- parts.append(")")
560
+ def get_data_unit_path(self, unit: str, **kwargs) -> Path:
561
+ """
562
+ Given a WP file url, return the corresponding path.
563
+ """
564
+ return self.base_path / unit.split("GIS/")[1]
193
565
 
194
- return "\n".join(parts)
566
+ def __repr__(self) -> str:
195
567
 
568
+ return (
569
+ f"WPPopulationConfig(",
570
+ f"project={self.project}, "
571
+ f"year={self.year}, "
572
+ f"resolution={self.resolution}, "
573
+ f"un_adjusted={self.un_adjusted}, "
574
+ f"constrained={self.constrained}, "
575
+ f"school_age={self.school_age}, "
576
+ f")",
577
+ )
196
578
 
197
- class WorldPopDownloader:
198
- """A class to handle downloads of WorldPop datasets."""
579
+
580
+ class WPPopulationDownloader(BaseHandlerDownloader):
199
581
 
200
582
  def __init__(
201
583
  self,
202
- config: Union[WorldPopConfig, dict[str, Union[str, int]]],
584
+ config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
203
585
  data_store: Optional[DataStore] = None,
204
586
  logger: Optional[logging.Logger] = None,
205
587
  ):
@@ -207,60 +589,263 @@ class WorldPopDownloader:
207
589
  Initialize the downloader.
208
590
 
209
591
  Args:
210
- config: Configuration for the WorldPop dataset, either as a WorldPopConfig object or a dictionary of parameters
592
+ config: Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters
211
593
  data_store: Optional data storage interface. If not provided, uses LocalDataStore.
212
594
  logger: Optional custom logger. If not provided, uses default logger.
213
595
  """
214
- self.logger = logger or global_config.get_logger(self.__class__.__name__)
215
- self.data_store = data_store or LocalDataStore()
216
- self.config = (
217
- config if isinstance(config, WorldPopConfig) else WorldPopConfig(**config)
596
+ config = (
597
+ config
598
+ if isinstance(config, WPPopulationConfig)
599
+ else WPPopulationConfig(**config)
218
600
  )
601
+ super().__init__(config=config, data_store=data_store, logger=logger)
219
602
 
220
- @classmethod
221
- def from_country_year(cls, country: str, year: int, **kwargs):
222
- """
223
- Create a downloader instance from country and year.
224
-
225
- Args:
226
- country: Country code or name
227
- year: Year of the dataset
228
- **kwargs: Additional parameters for WorldPopConfig or the downloader
229
- """
230
- return cls({"country": country, "year": year}, **kwargs)
231
-
232
- def download_dataset(self) -> str:
233
- """
234
- Download the configured dataset to the provided output path.
235
- """
236
-
603
+ def download_data_unit(self, url, **kwargs):
604
+ """Download data file for a url."""
237
605
  try:
238
- response = requests.get(self.config.dataset_url, stream=True)
606
+ response = self.config.client.session.get(
607
+ url, stream=True, timeout=self.config.client.timeout
608
+ )
239
609
  response.raise_for_status()
240
610
 
241
- output_path = str(self.config.dataset_path)
242
-
243
611
  total_size = int(response.headers.get("content-length", 0))
612
+ file_path = self.config.get_data_unit_path(url)
244
613
 
245
- with self.data_store.open(output_path, "wb") as file:
614
+ with self.data_store.open(file_path, "wb") as file:
246
615
  with tqdm(
247
616
  total=total_size,
248
617
  unit="B",
249
618
  unit_scale=True,
250
- desc=f"Downloading {os.path.basename(output_path)}",
619
+ desc=f"Downloading {os.path.basename(file_path)}",
251
620
  ) as pbar:
252
621
  for chunk in response.iter_content(chunk_size=8192):
253
622
  if chunk:
254
623
  file.write(chunk)
255
624
  pbar.update(len(chunk))
256
625
 
257
- self.logger.debug(f"Successfully downloaded dataset: {self.config}")
626
+ self.logger.info(f"Successfully downloaded: {file_path}")
627
+ return file_path
258
628
 
259
- return output_path
260
-
261
- except requests.exceptions.RequestException as e:
262
- self.logger.error(f"Failed to download dataset {self.config}: {str(e)}")
629
+ except requests.RequestException as e:
630
+ self.logger.error(f"Failed to download {url}: {e}")
263
631
  return None
264
632
  except Exception as e:
265
- self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
633
+ self.logger.error(f"Unexpected error downloading {url}: {e}")
266
634
  return None
635
+
636
+ def download_data_units(
637
+ self,
638
+ urls: List[str],
639
+ ) -> List[str]:
640
+ """Download data files for multiple urls."""
641
+
642
+ with multiprocessing.Pool(self.config.n_workers) as pool:
643
+ download_func = functools.partial(self.download_data_unit)
644
+ file_paths = list(
645
+ tqdm(
646
+ pool.imap(download_func, urls),
647
+ total=len(urls),
648
+ desc=f"Downloading data",
649
+ )
650
+ )
651
+
652
+ return [path for path in file_paths if path is not None]
653
+
654
+ def download(self, source: str, **kwargs) -> List[str]:
655
+ """Download data for a source"""
656
+ resources = self.config.get_relevant_data_units(source, **kwargs)
657
+ return self.download_data_units(resources)
658
+
659
+
660
+ class WPPopulationReader(BaseHandlerReader):
661
+
662
+ def __init__(
663
+ self,
664
+ config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
665
+ data_store: Optional[DataStore] = None,
666
+ logger: Optional[logging.Logger] = None,
667
+ ):
668
+ """
669
+ Initialize the reader.
670
+
671
+ Args:
672
+ config: Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters
673
+ data_store: Optional data storage interface. If not provided, uses LocalDataStore.
674
+ logger: Optional custom logger. If not provided, uses default logger.
675
+ """
676
+ config = (
677
+ config
678
+ if isinstance(config, WPPopulationConfig)
679
+ else WPPopulationConfig(**config)
680
+ )
681
+ super().__init__(config=config, data_store=data_store, logger=logger)
682
+
683
+ def load_from_paths(
684
+ self, source_data_path: List[Union[str, Path]], **kwargs
685
+ ) -> List[TifProcessor]:
686
+ """
687
+ Load TifProcessors of WP datasets.
688
+ Args:
689
+ source_data_path: List of file paths to load
690
+ Returns:
691
+ List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
692
+ """
693
+ return self._load_raster_data(raster_paths=source_data_path)
694
+
695
+
696
+ class WPPopulationHandler(BaseHandler):
697
+ """
698
+ Handler for WorldPop Populations datasets.
699
+
700
+ This class provides a unified interface for downloading and loading WP Population data.
701
+ It manages the lifecycle of configuration, downloading, and reading components.
702
+ """
703
+
704
+ def __init__(
705
+ self,
706
+ project: Literal["pop", "age_structures"] = "pop",
707
+ year: int = 2020,
708
+ resolution: int = 1000,
709
+ un_adjusted: bool = True,
710
+ constrained: bool = False,
711
+ school_age: bool = False,
712
+ config: Optional[WPPopulationConfig] = None,
713
+ downloader: Optional[WPPopulationDownloader] = None,
714
+ reader: Optional[WPPopulationReader] = None,
715
+ data_store: Optional[DataStore] = None,
716
+ logger: Optional[logging.Logger] = None,
717
+ **kwargs,
718
+ ):
719
+ self._project = project
720
+ self._year = year
721
+ self._resolution = resolution
722
+ self._un_adjusted = un_adjusted
723
+ self._constrained = constrained
724
+ self._school_age = school_age
725
+ super().__init__(
726
+ config=config,
727
+ downloader=downloader,
728
+ reader=reader,
729
+ data_store=data_store,
730
+ logger=logger,
731
+ )
732
+
733
+ def create_config(
734
+ self, data_store: DataStore, logger: logging.Logger, **kwargs
735
+ ) -> WPPopulationConfig:
736
+ """
737
+ Create and return a WPPopulationConfig instance.
738
+
739
+ Args:
740
+ data_store: The data store instance to use
741
+ logger: The logger instance to use
742
+ **kwargs: Additional configuration parameters
743
+
744
+ Returns:
745
+ Configured WPPopulationConfig instance
746
+ """
747
+ return WPPopulationConfig(
748
+ project=self._project,
749
+ year=self._year,
750
+ resolution=self._resolution,
751
+ un_adjusted=self._un_adjusted,
752
+ constrained=self._constrained,
753
+ school_age=self._school_age,
754
+ data_store=data_store,
755
+ logger=logger,
756
+ **kwargs,
757
+ )
758
+
759
+ def create_downloader(
760
+ self,
761
+ config: WPPopulationConfig,
762
+ data_store: DataStore,
763
+ logger: logging.Logger,
764
+ **kwargs,
765
+ ) -> WPPopulationDownloader:
766
+ """
767
+ Create and return a WPPopulationDownloader instance.
768
+
769
+ Args:
770
+ config: The configuration object
771
+ data_store: The data store instance to use
772
+ logger: The logger instance to use
773
+ **kwargs: Additional downloader parameters
774
+
775
+ Returns:
776
+ Configured WPPopulationDownloader instance
777
+ """
778
+ return WPPopulationDownloader(
779
+ config=config, data_store=data_store, logger=logger, **kwargs
780
+ )
781
+
782
+ def create_reader(
783
+ self,
784
+ config: WPPopulationConfig,
785
+ data_store: DataStore,
786
+ logger: logging.Logger,
787
+ **kwargs,
788
+ ) -> WPPopulationReader:
789
+ """
790
+ Create and return a WPPopulationReader instance.
791
+
792
+ Args:
793
+ config: The configuration object
794
+ data_store: The data store instance to use
795
+ logger: The logger instance to use
796
+ **kwargs: Additional reader parameters
797
+
798
+ Returns:
799
+ Configured WPPopulationReader instance
800
+ """
801
+ return WPPopulationReader(
802
+ config=config, data_store=data_store, logger=logger, **kwargs
803
+ )
804
+
805
+ def load_into_dataframe(
806
+ self,
807
+ source: str,
808
+ ensure_available: bool = True,
809
+ **kwargs,
810
+ ) -> pd.DataFrame:
811
+ """
812
+ Load WorldPop population data into a pandas DataFrame.
813
+
814
+ Args:
815
+ source: The data source specification
816
+ ensure_available: If True, ensure data is downloaded before loading
817
+ **kwargs: Additional parameters passed to load methods
818
+
819
+ Returns:
820
+ DataFrame containing the WorldPop data
821
+ """
822
+ tif_processors = self.load_data(
823
+ source=source, ensure_available=ensure_available, **kwargs
824
+ )
825
+ return pd.concat(
826
+ [tp.to_dataframe() for tp in tif_processors], ignore_index=True
827
+ )
828
+
829
+ def load_into_geodataframe(
830
+ self,
831
+ source: str,
832
+ ensure_available: bool = True,
833
+ **kwargs,
834
+ ) -> gpd.GeoDataFrame:
835
+ """
836
+ Load WorldPop population data into a geopandas GeoDataFrame.
837
+
838
+ Args:
839
+ source: The data source specification
840
+ ensure_available: If True, ensure data is downloaded before loading
841
+ **kwargs: Additional parameters passed to load methods
842
+
843
+ Returns:
844
+ GeoDataFrame containing the WorldPop data
845
+ """
846
+ tif_processors = self.load_data(
847
+ source=source, ensure_available=ensure_available, **kwargs
848
+ )
849
+ return pd.concat(
850
+ [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
851
+ )