giga-spatial 0.6.2-py3-none-any.whl → 0.6.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/METADATA +18 -8
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/RECORD +11 -11
- gigaspatial/__init__.py +1 -1
- gigaspatial/handlers/__init__.py +2 -2
- gigaspatial/handlers/ghsl.py +7 -6
- gigaspatial/handlers/hdx.py +414 -145
- gigaspatial/handlers/rwi.py +119 -121
- gigaspatial/processing/tif_processor.py +88 -2
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/top_level.txt +0 -0
{giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: giga-spatial
-Version: 0.6.2
+Version: 0.6.3
 Summary: A package for spatial data download & processing
 Home-page: https://github.com/unicef/giga-spatial
 Author: Utku Can Ozturk
@@ -51,17 +51,27 @@ Dynamic: summary
 
 # GigaSpatial
 
-
-Giga is a UNICEF-ITU initiative to connect every school to the Internet and every young person to information, opportunity and choice.
-Giga maps schools' Internet access in real time, creates models for innovative financing, and supports governments contracting for connectivity.
+## About Giga
 
-
+[Giga](https://giga.global/) is a UNICEF-ITU initiative to connect every school to the Internet and every young person to information, opportunity and choice.
+Giga maps schools' Internet access in real time, creates models for innovative financing, and supports governments contracting for connectivity.
 
 ## About GigaSpatial
 
 **GigaSpatial** is a Python package developed as part of the Giga Applied Science Team to handle geospatial data efficiently. It provides tools for downloading, processing, and analyzing geospatial data, enabling users to work with datasets such as OpenStreetMap (OSM), Global Human Settlement Layer (GHSL), Microsoft Global Buildings, Google Open Buildings, and more. The package is designed to support Giga's mission by providing robust geospatial capabilities for mapping and analyzing school connectivity.
 
-
+## Installation
+
+See the [installation docs](https://unicef.github.io/giga-spatial/getting-started/installation/) for all details. GigaSpatial requires Python 3.10 or above and depends on the following key packages:
+
+- geopandas
+- pandas
+- shapely
+- rasterio
+
+We recommend using a virtual environment for installation. See the [installation docs](https://unicef.github.io/giga-spatial/getting-started/installation/) for more details.
+
+## Key Features
 - **Data Downloading**: Download geospatial data from various sources including GHSL, Microsoft Global Buildings, Google Open Buildings, OpenCellID, and HDX datasets.
 - **Data Processing**: Process and transform geospatial data, such as GeoTIFF files and vector data, with support for compression and efficient handling.
 - **View Generators**:
@@ -74,7 +84,7 @@ Giga maps schools' Internet access in real time, creates models for innovative f
 - Centralized configuration via environment variables or `.env` file
 - Easy setup of API keys and paths
 
-
+## Supported Datasets
 
 The `gigaspatial` package supports data from the following providers:
 
@@ -84,7 +94,7 @@ The `gigaspatial` package supports data from the following providers:
 
 ---
 
-
+## View Generators
 
 The **view generators** in GigaSpatial are designed to enrich the spatial context of school locations and map data into grid or POI locations. This enables users to analyze and visualize geospatial data in meaningful ways.
 
{giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-giga_spatial-0.6.
-gigaspatial/__init__.py,sha256=
+giga_spatial-0.6.3.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+gigaspatial/__init__.py,sha256=zYiFHqR7JwbvdK9dvKrh-RTNfUqjHUwC4CTcFAPVYLc,22
 gigaspatial/config.py,sha256=yMf1ofOU0_I6iKDqshiFSYmK6TDIVpPm1AZo4e2okHU,8166
 gigaspatial/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 gigaspatial/core/io/__init__.py,sha256=y4QNWx6US1-adTuAO_NZwLmjzSQj25HNDL5hUGvEHZc,263
@@ -19,13 +19,13 @@ gigaspatial/generators/zonal/geometry.py,sha256=XPcX5lT7X7Z1vn72sN-VKLb2hDP9F_w3
 gigaspatial/generators/zonal/mercator.py,sha256=R_KlaqF4lnc0cRqVfcNVO8i0Re21_6w7pnclVKSohcY,3125
 gigaspatial/grid/__init__.py,sha256=H8SnNAMDafJXJ9bUp2zU0Z3t6s8niqY5rGP5nFhnbLA,45
 gigaspatial/grid/mercator_tiles.py,sha256=Z_3M4sy1tyxywAo2wmBb6niBP3x-IWgwMkmUp8LOSDg,10492
-gigaspatial/handlers/__init__.py,sha256=
+gigaspatial/handlers/__init__.py,sha256=pqK3rJtelOAkBaWNhpGy2t-p_zrwO-9BqABLQufTXF0,1449
 gigaspatial/handlers/base.py,sha256=rL94c3wDjsqzLp4na8FfYXW6tNjVGX6v4M-Ce4LrAro,26413
 gigaspatial/handlers/boundaries.py,sha256=hoO-b5MlFYwlCWogApcFyEx6OnxMJG29lqJurNGwOWg,11260
-gigaspatial/handlers/ghsl.py,sha256=
+gigaspatial/handlers/ghsl.py,sha256=GHao8lkmj1C0-QFqNwH9jr0Lqzu6NTj_7ooQdj1h6ok,27760
 gigaspatial/handlers/giga.py,sha256=2aP1EenDAQXn-h-uCyuVxEVZvAFEvrL17_z0MiS8FDs,4867
 gigaspatial/handlers/google_open_buildings.py,sha256=Liqk7qJhDtB4Ia4uhBe44LFcf-XVKBjRfj-pWlE5erY,16594
-gigaspatial/handlers/hdx.py,sha256=
+gigaspatial/handlers/hdx.py,sha256=DNw-LhxuJU3eNGihQGyPJT0a1PaOCupNHr7BDGal4Zo,18088
 gigaspatial/handlers/mapbox_image.py,sha256=M_nkJ_b1PD8FG1ajVgSycCb0NRTAI_SLpHdzszNetKA,7786
 gigaspatial/handlers/maxar_image.py,sha256=g5YVGV-8JjeG9bGBOp7ZfKani22J4izXX4hnB9A99Jk,10272
 gigaspatial/handlers/microsoft_global_buildings.py,sha256=bQ5WHIv3v0wWrZZUbZkKPRjgdlqIxlK7CV_0zSvdrTw,20292
@@ -33,15 +33,15 @@ gigaspatial/handlers/ookla_speedtest.py,sha256=EcvSAxJZ9GPfzYnT_C85Qgy2ecc9ndf70
 gigaspatial/handlers/opencellid.py,sha256=KuJqd-5-RO5ZzyDaBSrTgCK2ib5N_m3RUcPlX5heWwI,10683
 gigaspatial/handlers/osm.py,sha256=sLNMkOVh1v50jrWw7Z0-HILY5QTQjgKCHCeAfXj5jA8,14084
 gigaspatial/handlers/overture.py,sha256=lKeNw00v5Qia7LdWORuYihnlKEqxE9m38tdeRrvag9k,4218
-gigaspatial/handlers/rwi.py,sha256=
+gigaspatial/handlers/rwi.py,sha256=GDpQH9K96QZD3yezJOBiy5yZvYmrj4xbjUNSjYfNAh0,4875
 gigaspatial/handlers/unicef_georepo.py,sha256=ODYNvkU_UKgOHXT--0MqmJ4Uk6U1_mp9xgehbTzKpX8,31924
 gigaspatial/handlers/worldpop.py,sha256=oJ39NGajXi0rn829ZoFiaeG4_wavyPvljdActpxs12I,9850
 gigaspatial/processing/__init__.py,sha256=QDVL-QbLCrIb19lrajP7LrHNdGdnsLeGcvAs_jQpdRM,183
 gigaspatial/processing/geo.py,sha256=D-S3IlhQwLIxrCcxy6NhNmKLrOIjoRHfK_eZJGKpe2U,36947
 gigaspatial/processing/sat_images.py,sha256=YUbH5MFNzl6NX49Obk14WaFcr1s3SyGJIOk-kRpbBNg,1429
-gigaspatial/processing/tif_processor.py,sha256=
+gigaspatial/processing/tif_processor.py,sha256=zqcP_ioo9KHNJ6H0uba4UghW4MToTRwq1iE-nZbb8zA,21101
 gigaspatial/processing/utils.py,sha256=HC85vGKQakxlkoQAkZmeAXWHsenAwTIRn7jPKUA7x20,1500
-giga_spatial-0.6.
-giga_spatial-0.6.
-giga_spatial-0.6.
-giga_spatial-0.6.
+giga_spatial-0.6.3.dist-info/METADATA,sha256=Aw5adPdTcA3AuJBmZgAG4rJQYW4dJqw2GT90mYE7cgU,7467
+giga_spatial-0.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+giga_spatial-0.6.3.dist-info/top_level.txt,sha256=LZsccgw6H4zXT7m6Y4XChm-Y5LjHAwZ2hkGN_B3ExmI,12
+giga_spatial-0.6.3.dist-info/RECORD,,
gigaspatial/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.6.2"
+__version__ = "0.6.3"
gigaspatial/handlers/__init__.py
CHANGED
@@ -31,8 +31,8 @@ from gigaspatial.handlers.opencellid import (
|
|
31
31
|
OpenCellIDDownloader,
|
32
32
|
OpenCellIDReader,
|
33
33
|
)
|
34
|
-
from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader, HDXReader
|
35
|
-
from gigaspatial.handlers.rwi import RWIConfig,
|
34
|
+
from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader, HDXReader, HDXHandler
|
35
|
+
from gigaspatial.handlers.rwi import RWIConfig, RWIDownloader, RWIReader, RWIHandler
|
36
36
|
from gigaspatial.handlers.unicef_georepo import (
|
37
37
|
GeoRepoClient,
|
38
38
|
get_country_boundaries_by_iso3,
|
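Taken together with the hdx.py and rwi.py changes below, this import change re-exports the new handler classes from the handlers package. A minimal illustrative sketch (the constructor arguments are taken from the class definitions later in this diff; the dataset name is only an example):

    # Hypothetical usage; requires the installed package plus HDX API access at runtime.
    from gigaspatial.handlers import HDXHandler, RWIHandler

    hdx_handler = HDXHandler(dataset_name="some-hdx-dataset")  # any HDX dataset name
    rwi_handler = RWIHandler()  # pre-wired for the Relative Wealth Index dataset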
gigaspatial/handlers/ghsl.py
CHANGED
@@ -74,8 +74,6 @@ class GHSLDataConfig(BaseHandlerConfig):
 
     def __post_init__(self):
         super().__post_init__()
-        self.TILES_URL = self.TILES_URL.format(self.coord_system.value)
-        self._load_tiles()
 
     def _load_tiles(self):
         """Load GHSL tiles from tiles shapefile."""
@@ -158,6 +156,9 @@ class GHSLDataConfig(BaseHandlerConfig):
             )
             self.coord_system = CoordSystem.Mollweide
 
+        self.TILES_URL = self.TILES_URL.format(self.coord_system.value)
+        self._load_tiles()
+
         return self
 
     @property
@@ -176,7 +177,7 @@ class GHSLDataConfig(BaseHandlerConfig):
         self, points: Iterable[Union[Point, tuple]], **kwargs
     ) -> List[dict]:
         """
-        Return intersecting tiles
+        Return intersecting tiles for a list of points.
         """
         return self._get_relevant_tiles(points)
 
@@ -240,8 +241,8 @@ class GHSLDataConfig(BaseHandlerConfig):
             ValueError: If the input `source` is not one of the supported types.
         """
         if isinstance(source, gpd.GeoDataFrame):
-            if source.crs != "EPSG:4326":
-                source = source.to_crs("EPSG:4326")
+            # if source.crs != "EPSG:4326":
+            #     source = source.to_crs("EPSG:4326")
             search_geom = source.geometry.unary_union
         elif isinstance(
             source,
@@ -282,7 +283,7 @@ class GHSLDataConfig(BaseHandlerConfig):
             else ("3ss" if self.resolution == 100 else "30ss")
         )
         product_folder = f"{self.product}_GLOBE_{self.release}"
-        product_name = f"{self.product}_E{self.year}_GLOBE_{self.release}_{self.coord_system}_{resolution_str}"
+        product_name = f"{self.product}_E{self.year}_GLOBE_{self.release}_{self.coord_system.value}_{resolution_str}"
         product_version = 2 if self.product == "GHS_SMOD" else 1
 
         return {
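The practical effect of these edits is that the tile index is now loaded only after the coordinate-system validation has run (which may switch the config to Mollweide), and the product name is built from the enum's value rather than the enum member itself. A small, self-contained sketch of that last point, using a stand-in enum since the real CoordSystem values are not shown in this diff:

    from enum import Enum

    class CoordSystem(Enum):  # stand-in; the package defines its own CoordSystem
        WGS84 = 4326
        Mollweide = 54009  # assumed value, for illustration only

    coord_system = CoordSystem.Mollweide
    product, year, release, res = "GHS_BUILT_S", 2020, "R2023A", "100"

    # 0.6.2 interpolated the enum member, producing "...CoordSystem.Mollweide_100"
    old_name = f"{product}_E{year}_GLOBE_{release}_{coord_system}_{res}"
    # 0.6.3 uses .value, producing "...54009_100", presumably matching the remote file layout
    new_name = f"{product}_E{year}_GLOBE_{release}_{coord_system.value}_{res}"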
gigaspatial/handlers/hdx.py
CHANGED
@@ -1,88 +1,77 @@
 import os
 import logging
+from tqdm import tqdm
 from pathlib import Path
-from typing import List, Optional, Union, Dict, Any
+from typing import List, Optional, Tuple, Union, Dict, Any, Iterable
 import tempfile
+import functools
+import multiprocessing
 
 import pandas as pd
 import geopandas as gpd
-from pydantic import
+from pydantic import Field, ConfigDict
+from pydantic.dataclasses import dataclass
+from shapely.geometry.base import BaseGeometry
+from shapely.geometry import Point
+import pycountry
 
 from hdx.api.configuration import Configuration
 from hdx.data.dataset import Dataset
 from hdx.data.resource import Resource
 
 from gigaspatial.core.io.data_store import DataStore
-from gigaspatial.core.io.local_data_store import LocalDataStore
 from gigaspatial.core.io.readers import read_dataset
 from gigaspatial.config import config as global_config
+from gigaspatial.handlers.base import (
+    BaseHandlerConfig,
+    BaseHandlerDownloader,
+    BaseHandlerReader,
+    BaseHandler,
+)
 
 
-
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class HDXConfig(BaseHandlerConfig):
     """Configuration for HDX data access"""
 
     # User configuration
-    dataset_name: str = Field(
+    dataset_name: str = Field(
+        default=..., description="Name of the HDX dataset to download"
+    )
+
+    # Optional configuration with defaults
     base_path: Path = Field(default=global_config.get_path("hdx", "bronze"))
     user_agent: str = Field(
         default="gigaspatial", description="User agent for HDX API requests"
     )
     hdx_site: str = Field(default="prod", description="HDX site to use (prod or test)")
-    resource_filter: Optional[Dict[str, Any]] = Field(
-        default=None, description="Filter to apply to resources"
-    )
-
-    @property
-    def output_dir_path(self) -> Path:
-        """Path to save the downloaded HDX dataset"""
-        return self.base_path / self.dataset_name
-
-    def __repr__(self) -> str:
-        return (
-            f"HDXConfig(\n"
-            f"  dataset_name='{self.dataset_name}'\n"
-            f"  base_path='{self.base_path}'\n"
-            f"  hdx_site='{self.hdx_site}'\n"
-            f"  user_agent='{self.user_agent}'\n"
-            f")"
-        )
-
 
-
-
+    # Internal state
+    _hdx_configured: bool = Field(default=False, init=False)
+    dataset: Optional[Dataset] = Field(default=None, init=False)
 
-    def
-
-        config: Union[HDXConfig, dict],
-        data_store: Optional[DataStore] = None,
-        logger: Optional[logging.Logger] = None,
-    ):
-        if isinstance(config, dict):
-            self.config = HDXConfig(**config)
-        else:
-            self.config = config
-
-        self.data_store = data_store or LocalDataStore()
-        self.logger = logger or global_config.get_logger(self.__class__.__name__)
+    def __post_init__(self):
+        super().__post_init__()
         try:
             Configuration.read()
             self._hdx_configured = True
-        except:
+        except Exception:
             self._hdx_configured = False
+        self.configure_hdx()
+        self.dataset = self.fetch_dataset()
 
-    @
-    def
-        """
-
-        return cls(config=config)
+    @property
+    def output_dir_path(self) -> Path:
+        """Path to save the downloaded HDX dataset"""
+        return self.base_path / self.dataset_name
 
-    def
+    def configure_hdx(self):
         """Configure HDX API if not already configured"""
         if not self._hdx_configured:
             try:
                 Configuration.create(
-                    hdx_site=self.
-                    user_agent=self.
+                    hdx_site=self.hdx_site,
+                    user_agent=self.user_agent,
                     hdx_read_only=True,
                 )
                 self._hdx_configured = True
@@ -90,40 +79,104 @@ class HDXDownloader:
                 self.logger.error(f"Error configuring HDX API: {str(e)}")
                 raise
 
-    def
+    def fetch_dataset(self) -> Dataset:
         """Get the HDX dataset"""
-        self._configure_hdx()
-
         try:
-            self.logger.info(f"Fetching HDX dataset: {self.
-            dataset = Dataset.read_from_hdx(self.
+            self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
+            dataset = Dataset.read_from_hdx(self.dataset_name)
             if not dataset:
-                raise ValueError(
-                    f"Dataset '{self.config.dataset_name}' not found on HDX"
-                )
+                raise ValueError(f"Dataset '{self.dataset_name}' not found on HDX")
             return dataset
         except Exception as e:
             self.logger.error(f"Error fetching HDX dataset: {str(e)}")
             raise
 
+    def _match_pattern(self, value: str, pattern: str) -> bool:
+        """Check if a value matches a pattern"""
+        if isinstance(pattern, str):
+            return pattern.lower() in value.lower()
+        return value == pattern
+
+    def _get_patterns_for_value(self, value: Any) -> List[str]:
+        """Generate patterns for a given value or list of values"""
+        if isinstance(value, list):
+            patterns = []
+            for v in value:
+                patterns.extend(self._get_patterns_for_value(v))
+            return patterns
+
+        if not isinstance(value, str):
+            return [value]
+
+        patterns = []
+        value = value.lower()
+
+        # Add exact match
+        patterns.append(value)
+
+        # Add common variations
+        patterns.extend(
+            [
+                f"/{value}_",  # URL path with prefix
+                f"/{value}.",  # URL path with extension
+                f"_{value}_",  # Filename with value in middle
+                f"_{value}.",  # Filename with value at end
+            ]
+        )
+
+        # If value contains spaces, generate additional patterns
+        if " " in value:
+            # Generate patterns for space-less version
+            no_space = value.replace(" ", "")
+            patterns.extend(self._get_patterns_for_value(no_space))
+
+            # Generate patterns for hyphenated version
+            hyphenated = value.replace(" ", "-")
+            patterns.extend(self._get_patterns_for_value(hyphenated))
+
+        return patterns
+
     def get_dataset_resources(
-        self,
+        self, filter: Optional[Dict[str, Any]] = None, exact_match: bool = False
     ) -> List[Resource]:
-        """Get resources from the HDX dataset
-        dataset = dataset or self.get_dataset()
+        """Get resources from the HDX dataset
 
+        Args:
+            filter: Dictionary of key-value pairs to filter resources
+            exact_match: If True, perform exact matching. If False, use pattern matching
+        """
         try:
-            resources = dataset.get_resources()
+            resources = self.dataset.get_resources()
 
             # Apply resource filter if specified
-            if
+            if filter:
                 filtered_resources = []
                 for res in resources:
                     match = True
-                    for key, value in
-                        if key in res.data
+                    for key, value in filter.items():
+                        if key not in res.data:
                             match = False
                             break
+
+                        if exact_match:
+                            # For exact matching, check if value matches or is in list of values
+                            if isinstance(value, list):
+                                if res.data[key] not in value:
+                                    match = False
+                                    break
+                            elif res.data[key] != value:
+                                match = False
+                                break
+                        else:
+                            # For pattern matching, generate patterns for value(s)
+                            patterns = self._get_patterns_for_value(value)
+                            if not any(
+                                self._match_pattern(str(res.data[key]), pattern)
+                                for pattern in patterns
+                            ):
+                                match = False
+                                break
+
                     if match:
                         filtered_resources.append(res)
                 resources = filtered_resources
@@ -133,109 +186,325 @@ class HDXDownloader:
             self.logger.error(f"Error getting dataset resources: {str(e)}")
             raise
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+    def get_relevant_data_units(
+        self, source: Union[str, Dict], **kwargs
+    ) -> List[Resource]:
+        """Get relevant data units based on the source type
+
+        Args:
+            source: Either a country name/code (str) or a filter dictionary
+            **kwargs: Additional keyword arguments passed to the specific method
+
+        Returns:
+            List of matching resources
+        """
+        if isinstance(source, str):
+            # If source is a string, assume it's a country and use country-based filtering
+            return self.get_relevant_data_units_by_country(source, **kwargs)
+        elif isinstance(source, dict):
+            # If source is a dict, use it directly as a filter
+            return self.get_dataset_resources(filter=source, **kwargs)
+        else:
+            raise ValueError(f"Unsupported source type: {type(source)}")
 
-
-
-
-
-
-
-
-                    with tempfile.TemporaryDirectory() as tmpdir:
-                        url, local_path = res.download(folder=tmpdir)
-                        # Read the file and write to the DataStore
-                        with open(local_path, "rb") as f:
-                            data = f.read()
-                        # Compose the target path in the DataStore
-                        target_path = str(
-                            self.config.output_dir_path / Path(local_path).name
-                        )
-                        self.data_store.write_file(target_path, data)
-                        downloaded_paths.append(target_path)
-
-                    self.logger.info(
-                        f"Downloaded resource: {resource_name} to {target_path}"
-                    )
-                except Exception as e:
-                    self.logger.error(
-                        f"Error downloading resource {res.get('name', 'Unknown')}: {str(e)}"
-                    )
-
-        return downloaded_paths
+    def get_relevant_data_units_by_geometry(
+        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
+    ) -> List[Resource]:
+        raise NotImplementedError(
+            "HDX does not support geometry-based filtering. "
+            "Please use country-based filtering or direct resource filtering instead."
+        )
 
-
-
-
+    def get_relevant_data_units_by_points(
+        self, points: List[Union[Point, tuple]], **kwargs
+    ) -> List[Resource]:
+        raise NotImplementedError(
+            "HDX does not support point-based filtering. "
+            "Please use country-based filtering or direct resource filtering instead."
+        )
 
+    def get_relevant_data_units_by_country(
+        self,
+        country: str,
+        key: str = "url",
+        **kwargs,
+    ) -> Any:
+        """Get relevant data units for a country
+
+        Args:
+            country: Country name or code
+            key: The key to filter on in the resource data
+            patterns: List of patterns to match against the resource data
+            **kwargs: Additional keyword arguments
+        """
+        country = pycountry.countries.lookup(country)
+        values = [country.alpha_3, country.alpha_2, country.name]
+        return self.get_dataset_resources(
+            filter={key: values},
+        )
 
-
-
+    def get_data_unit_path(self, unit: str, **kwargs) -> str:
+        """Get the path for a data unit"""
+        try:
+            filename = unit.data["name"]
+        except:
+            filename = unit.get("download_url").split("/")[-1]
 
-
-        self,
-        dataset_name: str,
-        data_store: Optional[DataStore] = None,
-        base_path: Optional[Path] = None,
-    ):
-        self.dataset_name = dataset_name
-        self.data_store = data_store or LocalDataStore()
-        self.base_path = base_path or global_config.get_path("hdx", "bronze")
-        self.dataset_path = self.base_path / self.dataset_name
+        return self.output_dir_path / filename
 
     def list_resources(self) -> List[str]:
         """List all resources in the dataset directory using the data_store."""
+        dataset_folder = str(self.output_dir_path)
         # Check if the dataset directory exists in the data_store
         if not (
-            self.data_store.is_dir(
-            or self.data_store.file_exists(
+            self.data_store.is_dir(dataset_folder)
+            or self.data_store.file_exists(dataset_folder)
         ):
             raise FileNotFoundError(
-                f"HDX dataset
+                f"HDX dataset not found at {dataset_folder}. "
                 "Download the data first using HDXDownloader."
             )
-
-
+        return self.data_store.list_files(dataset_folder)
+
+    def __repr__(self) -> str:
+        return (
+            f"HDXConfig(\n"
+            f"  dataset_name='{self.dataset_name}'\n"
+            f"  base_path='{self.base_path}'\n"
+            f"  hdx_site='{self.hdx_site}'\n"
+            f"  user_agent='{self.user_agent}'\n"
+            f")"
+        )
 
-    def read_resource(
-        self, resource_file: str
-    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
-        """Read a specific resource file from the dataset using the data_store."""
-        file_path = str(self.dataset_path / resource_file)
 
-
-
-                f"Resource file {resource_file} not found in dataset {self.dataset_name}"
-            )
+class HDXDownloader(BaseHandlerDownloader):
+    """Downloader for HDX datasets"""
 
+    def __init__(
+        self,
+        config: Union[HDXConfig, dict],
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+    ):
+        config = config if isinstance(config, HDXConfig) else HDXConfig(**config)
+        super().__init__(config=config, data_store=data_store, logger=logger)
+
+    def download_data_unit(self, resource: str, **kwargs) -> str:
+        """Download a single resource"""
         try:
-
+            resource_name = resource.get("name", "Unknown")
+            self.logger.info(f"Downloading resource: {resource_name}")
+
+            with tempfile.TemporaryDirectory() as tmpdir:
+                url, local_path = resource.download(folder=tmpdir)
+                with open(local_path, "rb") as f:
+                    data = f.read()
+                # Compose the target path in the DataStore
+                target_path = str(self.config.get_data_unit_path(resource))
+                self.data_store.write_file(target_path, data)
+                self.logger.info(
+                    f"Downloaded resource: {resource_name} to {target_path}"
+                )
+            return target_path
         except Exception as e:
-
+            self.logger.error(f"Error downloading resource {resource_name}: {str(e)}")
+            return None
+
+    def download_data_units(self, resources: List[Resource], **kwargs) -> List[str]:
+        """Download multiple resources sequentially
+
+        Args:
+            resources: List of HDX Resource objects
+            **kwargs: Additional keyword arguments
+
+        Returns:
+            List of paths to downloaded files
+        """
+        if len(resources) == 0:
+            self.logger.warning("There is no resource to download")
+            return []
 
-
-
-
-
+        downloaded_paths = []
+        for resource in tqdm(resources, desc="Downloading resources"):
+            path = self.download_data_unit(resource)
+            if path:
+                downloaded_paths.append(path)
 
-
+        return downloaded_paths
+
+    def download(self, source: Union[Dict, str], **kwargs) -> List[str]:
+        """Download data for a source"""
+        resources = self.config.get_relevant_data_units(source, **kwargs)
+        return self.download_data_units(resources)
+
+
+class HDXReader(BaseHandlerReader):
+    """Reader for HDX datasets"""
+
+    def __init__(
+        self,
+        config: Optional[HDXConfig] = None,
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+    ):
+        config = config if isinstance(config, HDXConfig) else HDXConfig(**config)
+        super().__init__(config=config, data_store=data_store, logger=logger)
+
+    def resolve_source_paths(
+        self,
+        source: Union[
+            str,  # country code
+            Dict,  # filter
+            Path,  # path
+            str,  # path
+            List[Union[str, Path]],
+        ],
+        **kwargs,
+    ) -> List[Union[str, Path]]:
+        if isinstance(source, (str, Path)):
+            # Could be a country code or a path
+            if self.data_store.file_exists(str(source)) or str(source).endswith(
+                (".csv", ".tif", ".json", ".parquet", ".gz", ".geojson", ".zip")
+            ):
+                source_data_paths = self.resolve_by_paths(source)
+            else:
+                source_data_paths = self.resolve_by_country(source, **kwargs)
+        elif isinstance(source, Dict):
+            resources = self.config.get_relevant_data_units(source=source, **kwargs)
+            source_data_paths = self.config.get_data_unit_paths(resources, **kwargs)
+        elif isinstance(source, Iterable) and all(
+            isinstance(p, (str, Path)) for p in source
+        ):
+            source_data_paths = self.resolve_by_paths(source)
+        else:
+            raise NotImplementedError(f"Unsupported source type: {type(source)}")
+
+        self.logger.info(f"Resolved {len(source_data_paths)} paths!")
+        return source_data_paths
+
+    def load_from_paths(
+        self, source_data_path: List[Union[str, Path]], **kwargs
+    ) -> Any:
+        """Load data from paths"""
+        if len(source_data_path)==1:
+            return read_dataset(self.data_store, source_data_path[0])
+
+        all_data = {}
+        for file_path in source_data_path:
             try:
-
+                all_data[file_path] = read_dataset(self.data_store, file_path)
             except Exception as e:
-
+                raise ValueError(f"Could not read file {file_path}: {str(e)}")
+        return all_data
+
+    def load_all_resources(self):
+        resources = self.config.list_resources()
+        return self.load_from_paths(resources)
+
+    # def read_resource(
+    #     self, resource_file: str
+    # ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+    #     """Read a specific resource file from the dataset using the data_store."""
+    #     if not self.dataset_path:
+    #         raise ValueError("No dataset path configured")
+
+    #     file_path = str(self.dataset_path / resource_file)
+
+    #     if not self.data_store.file_exists(file_path):
+    #         raise FileNotFoundError(
+    #             f"Resource file {resource_file} not found in dataset"
+    #         )
+
+    #     try:
+    #         return read_dataset(self.data_store, file_path)
+    #     except Exception as e:
+    #         raise ValueError(f"Could not read file {file_path}: {str(e)}")
+
+    # def read_all_resources(self) -> Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]:
+    #     """Read all resources in the dataset directory using the data_store."""
+    #     resources = self.list_resources()
+    #     result = {}
+
+    #     for resource in resources:
+    #         try:
+    #             result[resource] = self.read_resource(resource)
+    #         except Exception as e:
+    #             self.logger.warning(f"Could not read resource {resource}: {str(e)}")
+
+    #     return result
+
+    # def load_from_paths(
+    #     self, source_data_path: List[Union[str, Path]], **kwargs
+    # ) -> Union[
+    #     pd.DataFrame, gpd.GeoDataFrame, Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]
+    # ]:
+    #     """Load data from paths"""
+    #     if len(source_data_path) == 1:
+    #         return self.read_resource(str(source_data_path[0]))
+    #     else:
+    #         return self.read_all_resources()
+
+
+class HDXHandler(BaseHandler):
+    """Handler for HDX datasets"""
 
-
+    def __init__(
+        self,
+        dataset_name: str,
+        config: Optional[HDXConfig] = None,
+        downloader: Optional[HDXDownloader] = None,
+        reader: Optional[HDXReader] = None,
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+        **kwargs,
+    ):
+        self._dataset_name = dataset_name
+        super().__init__(
+            config=config,
+            downloader=downloader,
+            reader=reader,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_config(
+        self, data_store: DataStore, logger: logging.Logger, **kwargs
+    ) -> HDXConfig:
+        """Create and return a HDXConfig instance"""
+        return HDXConfig(
+            dataset_name=self._dataset_name,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_downloader(
+        self,
+        config: HDXConfig,
+        data_store: DataStore,
+        logger: logging.Logger,
+        **kwargs,
+    ) -> HDXDownloader:
+        """Create and return a HDXDownloader instance"""
+        return HDXDownloader(
+            config=config,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_reader(
+        self,
+        config: HDXConfig,
+        data_store: DataStore,
+        logger: logging.Logger,
+        **kwargs,
+    ) -> HDXReader:
+        """Create and return a HDXReader instance"""
+        return HDXReader(
+            config=config,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
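The rewritten module splits responsibilities across HDXConfig (dataset metadata and resource filtering), HDXDownloader (fetching resources into a DataStore), HDXReader (resolving and loading downloaded files), and HDXHandler (wiring the three together). A hedged usage sketch based only on the signatures visible above; the dataset name and country code are example values, and network access to HDX is required:

    from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader, HDXReader

    # Fetches the dataset's metadata from HDX during initialization
    config = HDXConfig(dataset_name="relative-wealth-index")

    # Country filtering matches ISO3/ISO2 codes and the country name against resource URLs
    resources = config.get_relevant_data_units_by_country("KEN", key="url")

    downloader = HDXDownloader(config=config)
    paths = downloader.download_data_units(resources)  # files land under config.output_dir_path

    reader = HDXReader(config=config)
    data = reader.load_from_paths(paths)  # one DataFrame, or a dict keyed by file path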
gigaspatial/handlers/rwi.py
CHANGED
@@ -1,15 +1,17 @@
 import logging
-from
-from
-import
-import tempfile
+from typing import List, Optional, Union, Literal
+from pydantic.dataclasses import dataclass
+from datetime import datetime
 
-from
+from hdx.data.resource import Resource
+
+from pydantic import Field, ConfigDict
 
 from gigaspatial.core.io.data_store import DataStore
-from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader
+from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader, HDXReader, HDXHandler
 
 
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class RWIConfig(HDXConfig):
     """Configuration for Relative Wealth Index data access"""
 
@@ -22,16 +24,49 @@ class RWIConfig(HDXConfig):
     country: Optional[str] = Field(
         default=None, description="Country ISO code to filter data for"
     )
+    latest_only: bool = Field(
+        default=True,
+        description="If True, only get the latest resource for each country",
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+
+    def get_relevant_data_units_by_country(
+        self, country: str, **kwargs
+    ) -> List[Resource]:
+        """Get relevant data units for a country, optionally filtering for latest version"""
+        resources = super().get_relevant_data_units_by_country(
+            country=country, key="url"
+        )
+
+        if self.latest_only and len(resources) > 1:
+            # Find the resource with the latest creation date
+            latest_resource = None
+            latest_date = None
+
+            for resource in resources:
+                created = resource.get("created")
+                if created:
+                    try:
+                        created_dt = datetime.fromisoformat(
+                            created.replace("Z", "+00:00")
+                        )
+                        if latest_date is None or created_dt > latest_date:
+                            latest_date = created_dt
+                            latest_resource = resource
+                    except ValueError:
+                        self.logger.warning(
+                            f"Could not parse creation date for resource: {created}"
+                        )
+
+            if latest_resource:
+                resources = [latest_resource]
 
-
-    def validate_country(cls, value: str) -> str:
-        try:
-            return pycountry.countries.lookup(value).alpha_3
-        except LookupError:
-            raise ValueError(f"Invalid country code provided: {value}")
+        return resources
 
 
-class
+class RWIDownloader(HDXDownloader):
     """Specialized downloader for the Relative Wealth Index dataset from HDX"""
 
     def __init__(
@@ -40,118 +75,81 @@ class RelativeWealthIndexDownloader(HDXDownloader):
         data_store: Optional[DataStore] = None,
         logger: Optional[logging.Logger] = None,
     ):
-        if config
-            config = RWIConfig()
-        elif isinstance(config, dict):
-            config = RWIConfig(**config)
-
+        config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
         super().__init__(config=config, data_store=data_store, logger=logger)
 
-    @classmethod
-    def from_config(
-        cls,
-        country: Optional[str] = None,
-        **kwargs,
-    ):
-        """Create a downloader with RWI-specific configurations"""
-        config = RWIConfig(country=country, **kwargs)
-        return cls(config=config)
-
-    def download_dataset(self) -> List[str]:
-        """Download RWI dataset, optionally filtering for a specific country.
-
-        If country is specified, attempts to find and download only the resources
-        relevant to that country. Otherwise, downloads all RWI resources.
-
-        Returns:
-            List of paths to the downloaded files
-        """
-        # If no country specified, download all resources
-        if self.config.country is None:
-            return super().download_dataset()
-
-        # Get all resources from the dataset
-        try:
-            resources = self.get_dataset_resources()
-            if not resources:
-                self.logger.warning(f"No resources found for RWI dataset")
-                return []
-
-            # Prepare country identifiers for matching
-            country_code = self.config.country.lower()
-            country_name = pycountry.countries.lookup(self.config.country).name.lower()
-            country_alpha2 = pycountry.countries.lookup(
-                self.config.country
-            ).alpha_2.lower()
-
-            # Try different matching patterns
-            country_patterns = [
-                f"/{country_code}_",  # URL path with ISO3 prefix
-                f"/{country_code}.",  # URL path with ISO3 followed by extension
-                f"_{country_code}_",  # Filename with ISO3 in middle
-                f"_{country_code}.",  # Filename with ISO3 at end
-                f"/{country_name.replace(' ', '')}_",  # URL with no spaces
-                f"/{country_name.replace(' ', '-')}_",  # URL with hyphens
-                f"/{country_alpha2}_",  # URL with ISO2 code
-                country_name,  # Country name anywhere in URL
-            ]
-
-            # Find matching resources
-            matching_resources = []
-            for resource in resources:
-                # Get the URL safely
-                resource_url = resource.get("url", "")
-                if not resource_url:
-                    continue
-
-                resource_url = resource_url.lower()
-
-                # Check for matches with our patterns
-                if any(pattern in resource_url for pattern in country_patterns):
-                    matching_resources.append(resource)
-
-            if not matching_resources:
-                self.logger.warning(
-                    f"No resources matching country '{self.config.country}' were found. "
-                    f"Consider downloading the full dataset with country=None and filtering afterwards."
-                )
-                return []
-
-            # Download the matching resources
-            downloaded_paths = []
-            for res in matching_resources:
-                try:
-                    resource_name = res.get("name", "Unknown")
-                    self.logger.info(f"Downloading resource: {resource_name}")
-
-                    # Download to a temporary directory
-                    with tempfile.TemporaryDirectory() as tmpdir:
-                        url, local_path = res.download(folder=tmpdir)
-                        # Read the file and write to the DataStore
-                        with open(local_path, "rb") as f:
-                            data = f.read()
-                        # Compose the target path in the DataStore
-                        target_path = str(
-                            self.config.output_dir_path / Path(local_path).name
-                        )
-                        self.data_store.write_file(target_path, data)
-                        downloaded_paths.append(target_path)
 
-
-
-            )
+class RWIReader(HDXReader):
+    """Specialized reader for the Relative Wealth Index dataset from HDX"""
 
-
-
-
-
-
+    def __init__(
+        self,
+        config: Union[RWIConfig, dict] = None,
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+    ):
+        config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
+        super().__init__(config=config, data_store=data_store, logger=logger)
 
-        return downloaded_paths
 
-
-
+class RWIHandler(HDXHandler):
+    """Handler for Relative Wealth Index dataset"""
 
-
-
-
+    def __init__(
+        self,
+        config: Optional[RWIConfig] = None,
+        downloader: Optional[RWIDownloader] = None,
+        reader: Optional[RWIReader] = None,
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            dataset_name="relative-wealth-index",
+            config=config,
+            downloader=downloader,
+            reader=reader,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_config(
+        self, data_store: DataStore, logger: logging.Logger, **kwargs
+    ) -> RWIConfig:
+        """Create and return a RWIConfig instance"""
+        return RWIConfig(
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_downloader(
+        self,
+        config: RWIConfig,
+        data_store: DataStore,
+        logger: logging.Logger,
+        **kwargs,
+    ) -> RWIDownloader:
+        """Create and return a RWIDownloader instance"""
+        return RWIDownloader(
+            config=config,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_reader(
+        self,
+        config: RWIConfig,
+        data_store: DataStore,
+        logger: logging.Logger,
+        **kwargs,
+    ) -> RWIReader:
+        """Create and return a RWIReader instance"""
+        return RWIReader(
+            config=config,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
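With the country matching and the download loop now inherited from the HDX base classes, the RWI module shrinks to thin subclasses plus the latest_only filtering shown above. An illustrative sketch (it assumes RWIConfig pins its dataset_name to the Relative Wealth Index dataset, which is implied by RWIHandler but not visible in this hunk):

    from gigaspatial.handlers.rwi import RWIConfig, RWIDownloader

    config = RWIConfig(latest_only=True)   # keep only the most recently created resource per country
    downloader = RWIDownloader(config=config)
    paths = downloader.download("KEN")     # country-based filtering via the inherited HDX logic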
gigaspatial/processing/tif_processor.py
CHANGED
@@ -18,12 +18,12 @@ from gigaspatial.config import config
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class TifProcessor:
     """
-    A class to handle tif data processing, supporting single-band, RGB, and
+    A class to handle tif data processing, supporting single-band, RGB, RGBA, and multi-band data.
     """
 
     dataset_path: Union[Path, str]
     data_store: Optional[DataStore] = None
-    mode: Literal["single", "rgb", "rgba"] = "single"
+    mode: Literal["single", "rgb", "rgba", "multi"] = "single"
 
     def __post_init__(self):
         """Validate inputs and set up logging."""
@@ -36,10 +36,15 @@ class TifProcessor:
 
         self._load_metadata()
 
+        # Validate mode and band count
         if self.mode == "rgba" and self.count != 4:
             raise ValueError("RGBA mode requires a 4-band TIF file")
         if self.mode == "rgb" and self.count != 3:
             raise ValueError("RGB mode requires a 3-band TIF file")
+        if self.mode == "single" and self.count != 1:
+            raise ValueError("Single mode requires a 1-band TIF file")
+        if self.mode == "multi" and self.count < 2:
+            raise ValueError("Multi mode requires a TIF file with 2 or more bands")
 
     @contextmanager
     def open_dataset(self):
@@ -118,6 +123,16 @@ class TifProcessor:
                 self._tabular = self._to_rgb_dataframe(drop_nodata=True)
             elif self.mode == "rgba":
                 self._tabular = self._to_rgba_dataframe(drop_transparent=True)
+            elif self.mode == "multi":
+                self._tabular = self._to_multi_band_dataframe(
+                    drop_nodata=True,
+                    drop_values=[],
+                    band_names=None,  # Use default band naming
+                )
+            else:
+                raise ValueError(
+                    f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
+                )
         except Exception as e:
             raise ValueError(
                 f"Failed to process TIF file in mode '{self.mode}'. "
@@ -393,6 +408,77 @@ class TifProcessor:
         self.logger.info("Dataset is processed!")
         return data
 
+    def _to_multi_band_dataframe(
+        self,
+        drop_nodata: bool = True,
+        drop_values: list = [],
+        band_names: Optional[List[str]] = None,
+    ) -> pd.DataFrame:
+        """
+        Process multi-band TIF to DataFrame with all bands included.
+
+        Args:
+            drop_nodata (bool): Whether to drop nodata values. Defaults to True.
+            drop_values (list): Additional values to drop from the dataset. Defaults to empty list.
+            band_names (Optional[List[str]]): Custom names for the bands. If None, bands will be named using
+                                              the band descriptions from the GeoTIFF metadata if available,
+                                              otherwise 'band_1', 'band_2', etc.
+
+        Returns:
+            pd.DataFrame: DataFrame containing coordinates and all band values
+        """
+        self.logger.info("Processing multi-band dataset...")
+
+        with self.open_dataset() as src:
+            # Read all bands
+            stack = src.read()
+
+            x_coords, y_coords = self._get_pixel_coordinates()
+
+            # Initialize dictionary with coordinates
+            data_dict = {"lon": x_coords.flatten(), "lat": y_coords.flatten()}
+
+            # Get band descriptions from metadata if available
+            if band_names is None and hasattr(src, "descriptions") and src.descriptions:
+                band_names = [
+                    desc if desc else f"band_{i+1}"
+                    for i, desc in enumerate(src.descriptions)
+                ]
+
+            # Process each band
+            for band_idx in range(self.count):
+                band_data = stack[band_idx]
+
+                # Handle nodata and other values to drop
+                if drop_nodata or drop_values:
+                    values_to_mask = []
+                    if drop_nodata and src.nodata is not None:
+                        values_to_mask.append(src.nodata)
+                    if drop_values:
+                        values_to_mask.extend(drop_values)
+
+                    if values_to_mask:
+                        data_mask = ~np.isin(band_data, values_to_mask)
+                        band_values = np.extract(data_mask, band_data)
+                        if band_idx == 0:  # Only need to mask coordinates once
+                            data_dict["lon"] = np.extract(data_mask, x_coords)
+                            data_dict["lat"] = np.extract(data_mask, y_coords)
+                    else:
+                        band_values = band_data.flatten()
+                else:
+                    band_values = band_data.flatten()
+
+                # Use custom band names if provided, otherwise use descriptions or default naming
+                band_name = (
+                    band_names[band_idx]
+                    if band_names and len(band_names) > band_idx
+                    else f"band_{band_idx + 1}"
+                )
+                data_dict[band_name] = band_values
+
+        self.logger.info("Multi-band dataset is processed!")
+        return pd.DataFrame(data_dict)
+
     def _get_pixel_coordinates(self):
         """Helper method to generate coordinate arrays for all pixels"""
         if "pixel_coords" not in self._cache:
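For reference, the new mode can be exercised roughly as follows; the constructor fields and the _to_multi_band_dataframe signature are taken from this diff, while the file path and band names are placeholders:

    from gigaspatial.processing.tif_processor import TifProcessor

    processor = TifProcessor(dataset_path="data/stack_4band.tif", mode="multi")

    # "multi" mode routes through the new _to_multi_band_dataframe helper
    df = processor._to_multi_band_dataframe(
        drop_nodata=True,
        drop_values=[0],                             # also drop an explicit fill value
        band_names=["red", "green", "blue", "nir"],  # optional custom column names
    )
    # df holds lon/lat columns plus one column per band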
{giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/WHEEL
File without changes
{giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/licenses/LICENSE
File without changes
{giga_spatial-0.6.2.dist-info → giga_spatial-0.6.3.dist-info}/top_level.txt
File without changes