giga-spatial 0.6.2-py3-none-any.whl → 0.6.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,17 @@
  import logging
- from pathlib import Path
- from typing import List, Optional, Union, Dict, Any, Literal
- import pycountry
- import tempfile
+ from typing import List, Optional, Union, Literal
+ from pydantic.dataclasses import dataclass
+ from datetime import datetime
 
- from pydantic import Field, field_validator
+ from hdx.data.resource import Resource
+
+ from pydantic import Field, ConfigDict
 
  from gigaspatial.core.io.data_store import DataStore
- from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader
+ from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader, HDXReader, HDXHandler
 
 
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
  class RWIConfig(HDXConfig):
      """Configuration for Relative Wealth Index data access"""
 
@@ -22,16 +24,49 @@ class RWIConfig(HDXConfig):
      country: Optional[str] = Field(
          default=None, description="Country ISO code to filter data for"
      )
+     latest_only: bool = Field(
+         default=True,
+         description="If True, only get the latest resource for each country",
+     )
+
+     def __post_init__(self):
+         super().__post_init__()
+
+     def get_relevant_data_units_by_country(
+         self, country: str, **kwargs
+     ) -> List[Resource]:
+         """Get relevant data units for a country, optionally filtering for the latest version"""
+         resources = super().get_relevant_data_units_by_country(
+             country=country, key="url"
+         )
+
+         if self.latest_only and len(resources) > 1:
+             # Find the resource with the latest creation date
+             latest_resource = None
+             latest_date = None
+
+             for resource in resources:
+                 created = resource.get("created")
+                 if created:
+                     try:
+                         created_dt = datetime.fromisoformat(
+                             created.replace("Z", "+00:00")
+                         )
+                         if latest_date is None or created_dt > latest_date:
+                             latest_date = created_dt
+                             latest_resource = resource
+                     except ValueError:
+                         self.logger.warning(
+                             f"Could not parse creation date for resource: {created}"
+                         )
+
+             if latest_resource:
+                 resources = [latest_resource]
 
-     @field_validator("country")
-     def validate_country(cls, value: str) -> str:
-         try:
-             return pycountry.countries.lookup(value).alpha_3
-         except LookupError:
-             raise ValueError(f"Invalid country code provided: {value}")
+         return resources
 
 
- class RelativeWealthIndexDownloader(HDXDownloader):
+ class RWIDownloader(HDXDownloader):
      """Specialized downloader for the Relative Wealth Index dataset from HDX"""
 
      def __init__(
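
Two things happen in this hunk: the pycountry-based `validate_country` hook disappears from `RWIConfig` (its removal suggests country validation now happens upstream in `HDXConfig`, though this diff does not show that), and resource selection gains a `latest_only` switch that keeps only the resource with the newest HDX `created` timestamp, keeping all resources when no date can be parsed. A minimal usage sketch, assuming the import path `gigaspatial.handlers.rwi` and an ISO3 country code; the call site is illustrative, not taken from the wheel:

    from gigaspatial.handlers.rwi import RWIConfig

    # 0.6.4 default: at most one resource per country, the newest upload.
    config = RWIConfig(country="KEN")  # latest_only=True by default
    latest = config.get_relevant_data_units_by_country(country="KEN")

    # 0.6.2-like behaviour: return every matching resource.
    config_all = RWIConfig(country="KEN", latest_only=False)
    every_upload = config_all.get_relevant_data_units_by_country(country="KEN")
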
@@ -40,118 +75,81 @@ class RelativeWealthIndexDownloader(HDXDownloader):
          data_store: Optional[DataStore] = None,
          logger: Optional[logging.Logger] = None,
      ):
-         if config is None:
-             config = RWIConfig()
-         elif isinstance(config, dict):
-             config = RWIConfig(**config)
-
+         config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
          super().__init__(config=config, data_store=data_store, logger=logger)
 
-     @classmethod
-     def from_config(
-         cls,
-         country: Optional[str] = None,
-         **kwargs,
-     ):
-         """Create a downloader with RWI-specific configurations"""
-         config = RWIConfig(country=country, **kwargs)
-         return cls(config=config)
-
-     def download_dataset(self) -> List[str]:
-         """Download RWI dataset, optionally filtering for a specific country.
-
-         If country is specified, attempts to find and download only the resources
-         relevant to that country. Otherwise, downloads all RWI resources.
-
-         Returns:
-             List of paths to the downloaded files
-         """
-         # If no country specified, download all resources
-         if self.config.country is None:
-             return super().download_dataset()
-
-         # Get all resources from the dataset
-         try:
-             resources = self.get_dataset_resources()
-             if not resources:
-                 self.logger.warning(f"No resources found for RWI dataset")
-                 return []
-
-             # Prepare country identifiers for matching
-             country_code = self.config.country.lower()
-             country_name = pycountry.countries.lookup(self.config.country).name.lower()
-             country_alpha2 = pycountry.countries.lookup(
-                 self.config.country
-             ).alpha_2.lower()
-
-             # Try different matching patterns
-             country_patterns = [
-                 f"/{country_code}_",  # URL path with ISO3 prefix
-                 f"/{country_code}.",  # URL path with ISO3 followed by extension
-                 f"_{country_code}_",  # Filename with ISO3 in middle
-                 f"_{country_code}.",  # Filename with ISO3 at end
-                 f"/{country_name.replace(' ', '')}_",  # URL with no spaces
-                 f"/{country_name.replace(' ', '-')}_",  # URL with hyphens
-                 f"/{country_alpha2}_",  # URL with ISO2 code
-                 country_name,  # Country name anywhere in URL
-             ]
-
-             # Find matching resources
-             matching_resources = []
-             for resource in resources:
-                 # Get the URL safely
-                 resource_url = resource.get("url", "")
-                 if not resource_url:
-                     continue
-
-                 resource_url = resource_url.lower()
-
-                 # Check for matches with our patterns
-                 if any(pattern in resource_url for pattern in country_patterns):
-                     matching_resources.append(resource)
-
-             if not matching_resources:
-                 self.logger.warning(
-                     f"No resources matching country '{self.config.country}' were found. "
-                     f"Consider downloading the full dataset with country=None and filtering afterwards."
-                 )
-                 return []
-
-             # Download the matching resources
-             downloaded_paths = []
-             for res in matching_resources:
-                 try:
-                     resource_name = res.get("name", "Unknown")
-                     self.logger.info(f"Downloading resource: {resource_name}")
-
-                     # Download to a temporary directory
-                     with tempfile.TemporaryDirectory() as tmpdir:
-                         url, local_path = res.download(folder=tmpdir)
-                         # Read the file and write to the DataStore
-                         with open(local_path, "rb") as f:
-                             data = f.read()
-                         # Compose the target path in the DataStore
-                         target_path = str(
-                             self.config.output_dir_path / Path(local_path).name
-                         )
-                         self.data_store.write_file(target_path, data)
-                         downloaded_paths.append(target_path)
 
-                     self.logger.info(
-                         f"Downloaded resource: {resource_name} to {target_path}"
-                     )
+ class RWIReader(HDXReader):
+     """Specialized reader for the Relative Wealth Index dataset from HDX"""
 
-                 except Exception as e:
-                     resource_name = res.get("name", "Unknown")
-                     self.logger.error(
-                         f"Error downloading resource {resource_name}: {str(e)}"
-                     )
+     def __init__(
+         self,
+         config: Union[RWIConfig, dict] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
+         super().__init__(config=config, data_store=data_store, logger=logger)
 
-             return downloaded_paths
 
-         except Exception as e:
-             self.logger.error(f"Error during country-filtered download: {str(e)}")
+ class RWIHandler(HDXHandler):
+     """Handler for Relative Wealth Index dataset"""
 
-             # Fall back to downloading all resources
-             self.logger.info("Falling back to downloading all RWI resources")
-             return super().download_dataset()
+     def __init__(
+         self,
+         config: Optional[RWIConfig] = None,
+         downloader: Optional[RWIDownloader] = None,
+         reader: Optional[RWIReader] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+         **kwargs,
+     ):
+         super().__init__(
+             dataset_name="relative-wealth-index",
+             config=config,
+             downloader=downloader,
+             reader=reader,
+             data_store=data_store,
+             logger=logger,
+             **kwargs,
+         )
+
+     def create_config(
+         self, data_store: DataStore, logger: logging.Logger, **kwargs
+     ) -> RWIConfig:
+         """Create and return a RWIConfig instance"""
+         return RWIConfig(
+             data_store=data_store,
+             logger=logger,
+             **kwargs,
+         )
+
+     def create_downloader(
+         self,
+         config: RWIConfig,
+         data_store: DataStore,
+         logger: logging.Logger,
+         **kwargs,
+     ) -> RWIDownloader:
+         """Create and return a RWIDownloader instance"""
+         return RWIDownloader(
+             config=config,
+             data_store=data_store,
+             logger=logger,
+             **kwargs,
+         )
+
+     def create_reader(
+         self,
+         config: RWIConfig,
+         data_store: DataStore,
+         logger: logging.Logger,
+         **kwargs,
+     ) -> RWIReader:
+         """Create and return a RWIReader instance"""
+         return RWIReader(
+             config=config,
+             data_store=data_store,
+             logger=logger,
+             **kwargs,
+         )
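
The 0.6.2 downloader's hundred-line `download_dataset` override, with its URL pattern matching and tempfile copying, is deleted outright; country filtering now lives in `RWIConfig.get_relevant_data_units_by_country`, and the download/read/orchestrate split follows the `HDXDownloader`/`HDXReader`/`HDXHandler` template. One behavioural edge worth noting: the new one-line coercion means `config=None` falls through to `RWIConfig(**None)` and raises a `TypeError`, so callers should now pass an instance or a (possibly empty) dict. A sketch of the two ways to wire things up, assuming the import path `gigaspatial.handlers.rwi`; whether extra kwargs reach `create_config` depends on the `HDXHandler` base class, which this diff does not show:

    from gigaspatial.handlers.rwi import RWIConfig, RWIDownloader, RWIReader, RWIHandler

    # Explicit wiring, using only constructors shown in this diff. An empty
    # dict would also work; config=None would raise (see note above).
    config = RWIConfig(country="KEN", latest_only=True)
    downloader = RWIDownloader(config=config)
    reader = RWIReader(config=config)

    # Or let the handler build all three via its create_* factories
    # (assumed base-class behaviour, not shown in this diff).
    handler = RWIHandler()
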
@@ -18,12 +18,12 @@ from gigaspatial.config import config
  @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
  class TifProcessor:
      """
-     A class to handle tif data processing, supporting single-band, RGB, and RGBA data.
+     A class to handle tif data processing, supporting single-band, RGB, RGBA, and multi-band data.
      """
 
      dataset_path: Union[Path, str]
      data_store: Optional[DataStore] = None
-     mode: Literal["single", "rgb", "rgba"] = "single"
+     mode: Literal["single", "rgb", "rgba", "multi"] = "single"
 
      def __post_init__(self):
          """Validate inputs and set up logging."""
@@ -36,10 +36,15 @@ class TifProcessor:
 
          self._load_metadata()
 
+         # Validate mode and band count
          if self.mode == "rgba" and self.count != 4:
              raise ValueError("RGBA mode requires a 4-band TIF file")
          if self.mode == "rgb" and self.count != 3:
              raise ValueError("RGB mode requires a 3-band TIF file")
+         if self.mode == "single" and self.count != 1:
+             raise ValueError("Single mode requires a 1-band TIF file")
+         if self.mode == "multi" and self.count < 2:
+             raise ValueError("Multi mode requires a TIF file with 2 or more bands")
 
      @contextmanager
      def open_dataset(self):
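
With these checks, a mode/band-count mismatch now fails at construction time for all four modes, rather than only for rgb and rgba as in 0.6.2. A quick illustration (file name invented; the import path `gigaspatial.processing` is an assumption):

    from gigaspatial.processing import TifProcessor

    # A single-band raster opened in "multi" mode now fails immediately:
    TifProcessor(dataset_path="one_band.tif", mode="multi")
    # ValueError: Multi mode requires a TIF file with 2 or more bands
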
@@ -118,6 +123,16 @@ class TifProcessor:
                      self._tabular = self._to_rgb_dataframe(drop_nodata=True)
                  elif self.mode == "rgba":
                      self._tabular = self._to_rgba_dataframe(drop_transparent=True)
+                 elif self.mode == "multi":
+                     self._tabular = self._to_multi_band_dataframe(
+                         drop_nodata=True,
+                         drop_values=[],
+                         band_names=None,  # Use default band naming
+                     )
+                 else:
+                     raise ValueError(
+                         f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
+                     )
              except Exception as e:
                  raise ValueError(
                      f"Failed to process TIF file in mode '{self.mode}'. "
@@ -393,6 +408,77 @@ class TifProcessor:
          self.logger.info("Dataset is processed!")
          return data
 
+     def _to_multi_band_dataframe(
+         self,
+         drop_nodata: bool = True,
+         drop_values: list = [],
+         band_names: Optional[List[str]] = None,
+     ) -> pd.DataFrame:
+         """
+         Process multi-band TIF to DataFrame with all bands included.
+
+         Args:
+             drop_nodata (bool): Whether to drop nodata values. Defaults to True.
+             drop_values (list): Additional values to drop from the dataset. Defaults to empty list.
+             band_names (Optional[List[str]]): Custom names for the bands. If None, bands will be
+                 named using the band descriptions from the GeoTIFF metadata if available,
+                 otherwise 'band_1', 'band_2', etc.
+
+         Returns:
+             pd.DataFrame: DataFrame containing coordinates and all band values
+         """
+         self.logger.info("Processing multi-band dataset...")
+
+         with self.open_dataset() as src:
+             # Read all bands
+             stack = src.read()
+
+             x_coords, y_coords = self._get_pixel_coordinates()
+
+             # Initialize dictionary with coordinates
+             data_dict = {"lon": x_coords.flatten(), "lat": y_coords.flatten()}
+
+             # Get band descriptions from metadata if available
+             if band_names is None and hasattr(src, "descriptions") and src.descriptions:
+                 band_names = [
+                     desc if desc else f"band_{i+1}"
+                     for i, desc in enumerate(src.descriptions)
+                 ]
+
+             # Process each band
+             for band_idx in range(self.count):
+                 band_data = stack[band_idx]
+
+                 # Handle nodata and other values to drop
+                 if drop_nodata or drop_values:
+                     values_to_mask = []
+                     if drop_nodata and src.nodata is not None:
+                         values_to_mask.append(src.nodata)
+                     if drop_values:
+                         values_to_mask.extend(drop_values)
+
+                     if values_to_mask:
+                         data_mask = ~np.isin(band_data, values_to_mask)
+                         band_values = np.extract(data_mask, band_data)
+                         if band_idx == 0:  # Only need to mask coordinates once
+                             data_dict["lon"] = np.extract(data_mask, x_coords)
+                             data_dict["lat"] = np.extract(data_mask, y_coords)
+                     else:
+                         band_values = band_data.flatten()
+                 else:
+                     band_values = band_data.flatten()
+
+                 # Use custom band names if provided, otherwise use descriptions or default naming
+                 band_name = (
+                     band_names[band_idx]
+                     if band_names and len(band_names) > band_idx
+                     else f"band_{band_idx + 1}"
+                 )
+                 data_dict[band_name] = band_values
+
+         self.logger.info("Multi-band dataset is processed!")
+         return pd.DataFrame(data_dict)
+
 
      def _get_pixel_coordinates(self):
          """Helper method to generate coordinate arrays for all pixels"""
          if "pixel_coords" not in self._cache: