giga-spatial 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/METADATA +18 -8
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/RECORD +15 -15
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +6 -0
- gigaspatial/handlers/__init__.py +7 -3
- gigaspatial/handlers/boundaries.py +196 -43
- gigaspatial/handlers/ghsl.py +7 -6
- gigaspatial/handlers/giga.py +641 -0
- gigaspatial/handlers/hdx.py +411 -143
- gigaspatial/handlers/maxar_image.py +1 -2
- gigaspatial/handlers/rwi.py +119 -121
- gigaspatial/processing/tif_processor.py +88 -2
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/top_level.txt +0 -0
gigaspatial/handlers/rwi.py
CHANGED
@@ -1,15 +1,17 @@
|
|
1
1
|
import logging
|
2
|
-
from
|
3
|
-
from
|
4
|
-
import
|
5
|
-
import tempfile
|
2
|
+
from typing import List, Optional, Union, Literal
|
3
|
+
from pydantic.dataclasses import dataclass
|
4
|
+
from datetime import datetime
|
6
5
|
|
7
|
-
from
|
6
|
+
from hdx.data.resource import Resource
|
7
|
+
|
8
|
+
from pydantic import Field, ConfigDict
|
8
9
|
|
9
10
|
from gigaspatial.core.io.data_store import DataStore
|
10
|
-
from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader
|
11
|
+
from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader, HDXReader, HDXHandler
|
11
12
|
|
12
13
|
|
14
|
+
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
13
15
|
class RWIConfig(HDXConfig):
|
14
16
|
"""Configuration for Relative Wealth Index data access"""
|
15
17
|
|
@@ -22,16 +24,49 @@ class RWIConfig(HDXConfig):
|
|
22
24
|
country: Optional[str] = Field(
|
23
25
|
default=None, description="Country ISO code to filter data for"
|
24
26
|
)
|
27
|
+
latest_only: bool = Field(
|
28
|
+
default=True,
|
29
|
+
description="If True, only get the latest resource for each country",
|
30
|
+
)
|
31
|
+
|
32
|
+
def __post_init__(self):
|
33
|
+
super().__post_init__()
|
34
|
+
|
35
|
+
def get_relevant_data_units_by_country(
|
36
|
+
self, country: str, **kwargs
|
37
|
+
) -> List[Resource]:
|
38
|
+
"""Get relevant data units for a country, optionally filtering for latest version"""
|
39
|
+
resources = super().get_relevant_data_units_by_country(
|
40
|
+
country=country, key="url"
|
41
|
+
)
|
42
|
+
|
43
|
+
if self.latest_only and len(resources) > 1:
|
44
|
+
# Find the resource with the latest creation date
|
45
|
+
latest_resource = None
|
46
|
+
latest_date = None
|
47
|
+
|
48
|
+
for resource in resources:
|
49
|
+
created = resource.get("created")
|
50
|
+
if created:
|
51
|
+
try:
|
52
|
+
created_dt = datetime.fromisoformat(
|
53
|
+
created.replace("Z", "+00:00")
|
54
|
+
)
|
55
|
+
if latest_date is None or created_dt > latest_date:
|
56
|
+
latest_date = created_dt
|
57
|
+
latest_resource = resource
|
58
|
+
except ValueError:
|
59
|
+
self.logger.warning(
|
60
|
+
f"Could not parse creation date for resource: {created}"
|
61
|
+
)
|
62
|
+
|
63
|
+
if latest_resource:
|
64
|
+
resources = [latest_resource]
|
25
65
|
|
26
|
-
|
27
|
-
def validate_country(cls, value: str) -> str:
|
28
|
-
try:
|
29
|
-
return pycountry.countries.lookup(value).alpha_3
|
30
|
-
except LookupError:
|
31
|
-
raise ValueError(f"Invalid country code provided: {value}")
|
66
|
+
return resources
|
32
67
|
|
33
68
|
|
34
|
-
class RelativeWealthIndexDownloader(HDXDownloader):
|
69
|
+
class RWIDownloader(HDXDownloader):
|
35
70
|
"""Specialized downloader for the Relative Wealth Index dataset from HDX"""
|
36
71
|
|
37
72
|
def __init__(
|
@@ -40,118 +75,81 @@ class RelativeWealthIndexDownloader(HDXDownloader):
|
|
40
75
|
data_store: Optional[DataStore] = None,
|
41
76
|
logger: Optional[logging.Logger] = None,
|
42
77
|
):
|
43
|
-
if config is None:
|
44
|
-
config = RWIConfig()
|
45
|
-
elif isinstance(config, dict):
|
46
|
-
config = RWIConfig(**config)
|
47
|
-
|
78
|
+
config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
|
48
79
|
super().__init__(config=config, data_store=data_store, logger=logger)
|
49
80
|
|
50
|
-
@classmethod
|
51
|
-
def from_config(
|
52
|
-
cls,
|
53
|
-
country: Optional[str] = None,
|
54
|
-
**kwargs,
|
55
|
-
):
|
56
|
-
"""Create a downloader with RWI-specific configurations"""
|
57
|
-
config = RWIConfig(country=country, **kwargs)
|
58
|
-
return cls(config=config)
|
59
|
-
|
60
|
-
def download_dataset(self) -> List[str]:
|
61
|
-
"""Download RWI dataset, optionally filtering for a specific country.
|
62
|
-
|
63
|
-
If country is specified, attempts to find and download only the resources
|
64
|
-
relevant to that country. Otherwise, downloads all RWI resources.
|
65
|
-
|
66
|
-
Returns:
|
67
|
-
List of paths to the downloaded files
|
68
|
-
"""
|
69
|
-
# If no country specified, download all resources
|
70
|
-
if self.config.country is None:
|
71
|
-
return super().download_dataset()
|
72
|
-
|
73
|
-
# Get all resources from the dataset
|
74
|
-
try:
|
75
|
-
resources = self.get_dataset_resources()
|
76
|
-
if not resources:
|
77
|
-
self.logger.warning(f"No resources found for RWI dataset")
|
78
|
-
return []
|
79
|
-
|
80
|
-
# Prepare country identifiers for matching
|
81
|
-
country_code = self.config.country.lower()
|
82
|
-
country_name = pycountry.countries.lookup(self.config.country).name.lower()
|
83
|
-
country_alpha2 = pycountry.countries.lookup(
|
84
|
-
self.config.country
|
85
|
-
).alpha_2.lower()
|
86
|
-
|
87
|
-
# Try different matching patterns
|
88
|
-
country_patterns = [
|
89
|
-
f"/{country_code}_", # URL path with ISO3 prefix
|
90
|
-
f"/{country_code}.", # URL path with ISO3 followed by extension
|
91
|
-
f"_{country_code}_", # Filename with ISO3 in middle
|
92
|
-
f"_{country_code}.", # Filename with ISO3 at end
|
93
|
-
f"/{country_name.replace(' ', '')}_", # URL with no spaces
|
94
|
-
f"/{country_name.replace(' ', '-')}_", # URL with hyphens
|
95
|
-
f"/{country_alpha2}_", # URL with ISO2 code
|
96
|
-
country_name, # Country name anywhere in URL
|
97
|
-
]
|
98
|
-
|
99
|
-
# Find matching resources
|
100
|
-
matching_resources = []
|
101
|
-
for resource in resources:
|
102
|
-
# Get the URL safely
|
103
|
-
resource_url = resource.get("url", "")
|
104
|
-
if not resource_url:
|
105
|
-
continue
|
106
|
-
|
107
|
-
resource_url = resource_url.lower()
|
108
|
-
|
109
|
-
# Check for matches with our patterns
|
110
|
-
if any(pattern in resource_url for pattern in country_patterns):
|
111
|
-
matching_resources.append(resource)
|
112
|
-
|
113
|
-
if not matching_resources:
|
114
|
-
self.logger.warning(
|
115
|
-
f"No resources matching country '{self.config.country}' were found. "
|
116
|
-
f"Consider downloading the full dataset with country=None and filtering afterwards."
|
117
|
-
)
|
118
|
-
return []
|
119
|
-
|
120
|
-
# Download the matching resources
|
121
|
-
downloaded_paths = []
|
122
|
-
for res in matching_resources:
|
123
|
-
try:
|
124
|
-
resource_name = res.get("name", "Unknown")
|
125
|
-
self.logger.info(f"Downloading resource: {resource_name}")
|
126
|
-
|
127
|
-
# Download to a temporary directory
|
128
|
-
with tempfile.TemporaryDirectory() as tmpdir:
|
129
|
-
url, local_path = res.download(folder=tmpdir)
|
130
|
-
# Read the file and write to the DataStore
|
131
|
-
with open(local_path, "rb") as f:
|
132
|
-
data = f.read()
|
133
|
-
# Compose the target path in the DataStore
|
134
|
-
target_path = str(
|
135
|
-
self.config.output_dir_path / Path(local_path).name
|
136
|
-
)
|
137
|
-
self.data_store.write_file(target_path, data)
|
138
|
-
downloaded_paths.append(target_path)
|
139
81
|
|
140
|
-
|
141
|
-
|
142
|
-
)
|
82
|
+
class RWIReader(HDXReader):
|
83
|
+
"""Specialized reader for the Relative Wealth Index dataset from HDX"""
|
143
84
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
85
|
+
def __init__(
|
86
|
+
self,
|
87
|
+
config: Union[RWIConfig, dict] = None,
|
88
|
+
data_store: Optional[DataStore] = None,
|
89
|
+
logger: Optional[logging.Logger] = None,
|
90
|
+
):
|
91
|
+
config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
|
92
|
+
super().__init__(config=config, data_store=data_store, logger=logger)
|
149
93
|
|
150
|
-
return downloaded_paths
|
151
94
|
|
152
|
-
|
153
|
-
|
95
|
+
class RWIHandler(HDXHandler):
|
96
|
+
"""Handler for Relative Wealth Index dataset"""
|
154
97
|
|
155
|
-
|
156
|
-
|
157
|
-
|
98
|
+
def __init__(
|
99
|
+
self,
|
100
|
+
config: Optional[RWIConfig] = None,
|
101
|
+
downloader: Optional[RWIDownloader] = None,
|
102
|
+
reader: Optional[RWIReader] = None,
|
103
|
+
data_store: Optional[DataStore] = None,
|
104
|
+
logger: Optional[logging.Logger] = None,
|
105
|
+
**kwargs,
|
106
|
+
):
|
107
|
+
super().__init__(
|
108
|
+
dataset_name="relative-wealth-index",
|
109
|
+
config=config,
|
110
|
+
downloader=downloader,
|
111
|
+
reader=reader,
|
112
|
+
data_store=data_store,
|
113
|
+
logger=logger,
|
114
|
+
**kwargs,
|
115
|
+
)
|
116
|
+
|
117
|
+
def create_config(
|
118
|
+
self, data_store: DataStore, logger: logging.Logger, **kwargs
|
119
|
+
) -> RWIConfig:
|
120
|
+
"""Create and return a RWIConfig instance"""
|
121
|
+
return RWIConfig(
|
122
|
+
data_store=data_store,
|
123
|
+
logger=logger,
|
124
|
+
**kwargs,
|
125
|
+
)
|
126
|
+
|
127
|
+
def create_downloader(
|
128
|
+
self,
|
129
|
+
config: RWIConfig,
|
130
|
+
data_store: DataStore,
|
131
|
+
logger: logging.Logger,
|
132
|
+
**kwargs,
|
133
|
+
) -> RWIDownloader:
|
134
|
+
"""Create and return a RWIDownloader instance"""
|
135
|
+
return RWIDownloader(
|
136
|
+
config=config,
|
137
|
+
data_store=data_store,
|
138
|
+
logger=logger,
|
139
|
+
**kwargs,
|
140
|
+
)
|
141
|
+
|
142
|
+
def create_reader(
|
143
|
+
self,
|
144
|
+
config: RWIConfig,
|
145
|
+
data_store: DataStore,
|
146
|
+
logger: logging.Logger,
|
147
|
+
**kwargs,
|
148
|
+
) -> RWIReader:
|
149
|
+
"""Create and return a RWIReader instance"""
|
150
|
+
return RWIReader(
|
151
|
+
config=config,
|
152
|
+
data_store=data_store,
|
153
|
+
logger=logger,
|
154
|
+
**kwargs,
|
155
|
+
)
|
gigaspatial/processing/tif_processor.py
CHANGED
@@ -18,12 +18,12 @@ from gigaspatial.config import config
|
|
18
18
|
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
19
19
|
class TifProcessor:
|
20
20
|
"""
|
21
|
-
A class to handle tif data processing, supporting single-band, RGB, and RGBA data.
|
21
|
+
A class to handle tif data processing, supporting single-band, RGB, RGBA, and multi-band data.
|
22
22
|
"""
|
23
23
|
|
24
24
|
dataset_path: Union[Path, str]
|
25
25
|
data_store: Optional[DataStore] = None
|
26
|
-
mode: Literal["single", "rgb", "rgba"] = "single"
|
26
|
+
mode: Literal["single", "rgb", "rgba", "multi"] = "single"
|
27
27
|
|
28
28
|
def __post_init__(self):
|
29
29
|
"""Validate inputs and set up logging."""
|
@@ -36,10 +36,15 @@ class TifProcessor:
|
|
36
36
|
|
37
37
|
self._load_metadata()
|
38
38
|
|
39
|
+
# Validate mode and band count
|
39
40
|
if self.mode == "rgba" and self.count != 4:
|
40
41
|
raise ValueError("RGBA mode requires a 4-band TIF file")
|
41
42
|
if self.mode == "rgb" and self.count != 3:
|
42
43
|
raise ValueError("RGB mode requires a 3-band TIF file")
|
44
|
+
if self.mode == "single" and self.count != 1:
|
45
|
+
raise ValueError("Single mode requires a 1-band TIF file")
|
46
|
+
if self.mode == "multi" and self.count < 2:
|
47
|
+
raise ValueError("Multi mode requires a TIF file with 2 or more bands")
|
43
48
|
|
44
49
|
@contextmanager
|
45
50
|
def open_dataset(self):
|
@@ -118,6 +123,16 @@ class TifProcessor:
|
|
118
123
|
self._tabular = self._to_rgb_dataframe(drop_nodata=True)
|
119
124
|
elif self.mode == "rgba":
|
120
125
|
self._tabular = self._to_rgba_dataframe(drop_transparent=True)
|
126
|
+
elif self.mode == "multi":
|
127
|
+
self._tabular = self._to_multi_band_dataframe(
|
128
|
+
drop_nodata=True,
|
129
|
+
drop_values=[],
|
130
|
+
band_names=None, # Use default band naming
|
131
|
+
)
|
132
|
+
else:
|
133
|
+
raise ValueError(
|
134
|
+
f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
|
135
|
+
)
|
121
136
|
except Exception as e:
|
122
137
|
raise ValueError(
|
123
138
|
f"Failed to process TIF file in mode '{self.mode}'. "
|
@@ -393,6 +408,77 @@ class TifProcessor:
|
|
393
408
|
self.logger.info("Dataset is processed!")
|
394
409
|
return data
|
395
410
|
|
411
|
+
def _to_multi_band_dataframe(
|
412
|
+
self,
|
413
|
+
drop_nodata: bool = True,
|
414
|
+
drop_values: list = [],
|
415
|
+
band_names: Optional[List[str]] = None,
|
416
|
+
) -> pd.DataFrame:
|
417
|
+
"""
|
418
|
+
Process multi-band TIF to DataFrame with all bands included.
|
419
|
+
|
420
|
+
Args:
|
421
|
+
drop_nodata (bool): Whether to drop nodata values. Defaults to True.
|
422
|
+
drop_values (list): Additional values to drop from the dataset. Defaults to empty list.
|
423
|
+
band_names (Optional[List[str]]): Custom names for the bands. If None, bands will be named using
|
424
|
+
the band descriptions from the GeoTIFF metadata if available,
|
425
|
+
otherwise 'band_1', 'band_2', etc.
|
426
|
+
|
427
|
+
Returns:
|
428
|
+
pd.DataFrame: DataFrame containing coordinates and all band values
|
429
|
+
"""
|
430
|
+
self.logger.info("Processing multi-band dataset...")
|
431
|
+
|
432
|
+
with self.open_dataset() as src:
|
433
|
+
# Read all bands
|
434
|
+
stack = src.read()
|
435
|
+
|
436
|
+
x_coords, y_coords = self._get_pixel_coordinates()
|
437
|
+
|
438
|
+
# Initialize dictionary with coordinates
|
439
|
+
data_dict = {"lon": x_coords.flatten(), "lat": y_coords.flatten()}
|
440
|
+
|
441
|
+
# Get band descriptions from metadata if available
|
442
|
+
if band_names is None and hasattr(src, "descriptions") and src.descriptions:
|
443
|
+
band_names = [
|
444
|
+
desc if desc else f"band_{i+1}"
|
445
|
+
for i, desc in enumerate(src.descriptions)
|
446
|
+
]
|
447
|
+
|
448
|
+
# Process each band
|
449
|
+
for band_idx in range(self.count):
|
450
|
+
band_data = stack[band_idx]
|
451
|
+
|
452
|
+
# Handle nodata and other values to drop
|
453
|
+
if drop_nodata or drop_values:
|
454
|
+
values_to_mask = []
|
455
|
+
if drop_nodata and src.nodata is not None:
|
456
|
+
values_to_mask.append(src.nodata)
|
457
|
+
if drop_values:
|
458
|
+
values_to_mask.extend(drop_values)
|
459
|
+
|
460
|
+
if values_to_mask:
|
461
|
+
data_mask = ~np.isin(band_data, values_to_mask)
|
462
|
+
band_values = np.extract(data_mask, band_data)
|
463
|
+
if band_idx == 0: # Only need to mask coordinates once
|
464
|
+
data_dict["lon"] = np.extract(data_mask, x_coords)
|
465
|
+
data_dict["lat"] = np.extract(data_mask, y_coords)
|
466
|
+
else:
|
467
|
+
band_values = band_data.flatten()
|
468
|
+
else:
|
469
|
+
band_values = band_data.flatten()
|
470
|
+
|
471
|
+
# Use custom band names if provided, otherwise use descriptions or default naming
|
472
|
+
band_name = (
|
473
|
+
band_names[band_idx]
|
474
|
+
if band_names and len(band_names) > band_idx
|
475
|
+
else f"band_{band_idx + 1}"
|
476
|
+
)
|
477
|
+
data_dict[band_name] = band_values
|
478
|
+
|
479
|
+
self.logger.info("Multi-band dataset is processed!")
|
480
|
+
return pd.DataFrame(data_dict)
|
481
|
+
|
396
482
|
def _get_pixel_coordinates(self):
|
397
483
|
"""Helper method to generate coordinate arrays for all pixels"""
|
398
484
|
if "pixel_coords" not in self._cache:
|
File without changes
|
File without changes
|
File without changes
|