giga-spatial 0.6.0 (giga_spatial-0.6.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
gigaspatial/handlers/osm.py
@@ -0,0 +1,356 @@
+ import requests
+ import pandas as pd
+ from typing import Any, List, Dict, Union, Optional, Literal
+ from dataclasses import dataclass
+ from time import sleep
+ from concurrent.futures import ThreadPoolExecutor
+ from requests.exceptions import RequestException
+ from shapely.geometry import Polygon, Point
+ import pycountry
+
+ from gigaspatial.config import config
+
+
+ @dataclass
+ class OSMLocationFetcher:
+     """
+     A class to fetch and process location data from OpenStreetMap using the Overpass API.
+
+     This class supports fetching various OSM location types including amenities, buildings,
+     shops, and other POI categories.
+     """
+
+     country: str
+     location_types: Union[List[str], Dict[str, List[str]]]
+     base_url: str = "http://overpass-api.de/api/interpreter"
+     timeout: int = 600
+     max_retries: int = 3
+     retry_delay: int = 5
+
+     def __post_init__(self):
+         """Validate inputs, normalize location_types, and set up logging."""
+         try:
+             self.country = pycountry.countries.lookup(self.country).alpha_2
+         except LookupError:
+             raise ValueError(f"Invalid country code provided: {self.country}")
+
+         # Normalize location_types so it is always a dictionary mapping
+         # OSM tag keys (categories) to lists of accepted values
+         if isinstance(self.location_types, list):
+             self.location_types = {"amenity": self.location_types}
+         elif not isinstance(self.location_types, dict):
+             raise TypeError(
+                 "location_types must be a list of strings or a dictionary mapping categories to type lists"
+             )
+
+         self.logger = config.get_logger(self.__class__.__name__)
+
+     def _build_queries(self, since_year: Optional[int] = None) -> List[str]:
+         """
+         Construct separate Overpass QL queries for different element types and categories.
+
+         Returns:
+             List of [nodes_relations_query, ways_query]
+         """
+         if since_year:
+             date_filter = f'(newer:"{since_year}-01-01T00:00:00Z")'
+         else:
+             date_filter = ""
+
+         # Query for nodes and relations (with center output)
+         nodes_relations_queries = []
+         for category, types in self.location_types.items():
+             nodes_relations_queries.extend(
+                 [
+                     f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                     f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                 ]
+             )
+
+         nodes_relations_queries = "\n".join(nodes_relations_queries)
+
+         nodes_relations_query = f"""
+         [out:json][timeout:{self.timeout}];
+         area["ISO3166-1"="{self.country}"]->.searchArea;
+         (
+         {nodes_relations_queries}
+         );
+         out center;
+         """
+
+         # Query for ways (with geometry output)
+         ways_queries = []
+         for category, types in self.location_types.items():
+             ways_queries.append(
+                 f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
+             )
+
+         ways_queries = "\n".join(ways_queries)
+
+         ways_query = f"""
+         [out:json][timeout:{self.timeout}];
+         area["ISO3166-1"="{self.country}"]->.searchArea;
+         (
+         {ways_queries}
+         );
+         out geom;
+         """
+
+         return [nodes_relations_query, ways_query]
+
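+     # Illustrative example (not part of the module): for country="TR" and
+     # location_types={"amenity": ["school", "hospital"]}, the first query
+     # rendered above looks roughly like:
+     #
+     #   [out:json][timeout:600];
+     #   area["ISO3166-1"="TR"]->.searchArea;
+     #   (
+     #   node["amenity"~"^(school|hospital)"](area.searchArea);
+     #   relation["amenity"~"^(school|hospital)"](area.searchArea);
+     #   );
+     #   out center;
+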
+     def _make_request(self, query: str) -> Dict:
+         """Make an HTTP request to the Overpass API with a retry mechanism."""
+         for attempt in range(self.max_retries):
+             try:
+                 self.logger.debug(f"Executing query:\n{query}")
+                 response = requests.get(
+                     self.base_url, params={"data": query}, timeout=self.timeout
+                 )
+                 response.raise_for_status()
+                 return response.json()
+             except RequestException as e:
+                 self.logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")
+                 if attempt < self.max_retries - 1:
+                     sleep(self.retry_delay)
+                 else:
+                     raise RuntimeError(
+                         f"Failed to fetch data after {self.max_retries} attempts"
+                     ) from e
+
+     def _extract_matching_categories(self, tags: Dict[str, str]) -> Dict[str, str]:
+         """
+         Extract all matching categories and their values from the tags.
+
+         Returns:
+             Dict mapping each matching category to its value
+         """
+         matches = {}
+         for category, types in self.location_types.items():
+             if category in tags and tags[category] in types:
+                 matches[category] = tags[category]
+         return matches
+
+     def _process_node_relation(self, element: Dict) -> List[Dict[str, Any]]:
+         """
+         Process a node or relation element.
+
+         May return multiple processed elements if the element matches multiple categories.
+         """
+         try:
+             tags = element.get("tags", {})
+             matching_categories = self._extract_matching_categories(tags)
+
+             if not matching_categories:
+                 self.logger.warning(
+                     f"Element {element['id']} missing or not matching specified category tags"
+                 )
+                 return []
+
+             # Nodes carry their own coordinates; relations expose a computed
+             # center (requested via `out center`). Check key presence
+             # explicitly so coordinates of exactly 0 are not skipped.
+             _lat = element["lat"] if "lat" in element else element["center"]["lat"]
+             _lon = element["lon"] if "lon" in element else element["center"]["lon"]
+             point_geom = Point(_lon, _lat)
+
+             # For each matching category, create a separate element
+             results = []
+             for category, value in matching_categories.items():
+                 results.append(
+                     {
+                         "source_id": element["id"],
+                         "category": category,
+                         "category_value": value,
+                         "name": tags.get("name", ""),
+                         "name_en": tags.get("name:en", ""),
+                         "type": element["type"],
+                         "geometry": point_geom,
+                         "latitude": _lat,
+                         "longitude": _lon,
+                         "matching_categories": list(matching_categories.keys()),
+                     }
+                 )
+
+             return results
+
+         except KeyError as e:
+             self.logger.error(f"Corrupt data received for node/relation: {str(e)}")
+             return []
+
+     def _process_way(self, element: Dict) -> List[Dict[str, Any]]:
+         """
+         Process a way element with geometry.
+
+         May return multiple processed elements if the element matches multiple categories.
+         """
+         try:
+             tags = element.get("tags", {})
+             matching_categories = self._extract_matching_categories(tags)
+
+             if not matching_categories:
+                 self.logger.warning(
+                     f"Element {element['id']} missing or not matching specified category tags"
+                 )
+                 return []
+
+             # Create a polygon from the way's geometry points
+             polygon = Polygon([(p["lon"], p["lat"]) for p in element["geometry"]])
+             centroid = polygon.centroid
+
+             # For each matching category, create a separate element
+             results = []
+             for category, value in matching_categories.items():
+                 results.append(
+                     {
+                         "source_id": element["id"],
+                         "category": category,
+                         "category_value": value,
+                         "name": tags.get("name", ""),
+                         "name_en": tags.get("name:en", ""),
+                         "type": element["type"],
+                         "geometry": polygon,
+                         "latitude": centroid.y,
+                         "longitude": centroid.x,
+                         "matching_categories": list(matching_categories.keys()),
+                     }
+                 )
+
+             return results
+         except (KeyError, ValueError) as e:
+             self.logger.error(f"Error processing way geometry: {str(e)}")
+             return []
+
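+     # Illustration (not in the module): an element tagged both
+     # amenity=school and building=school yields, per strategy:
+     #   separate -> two rows, category="amenity" and category="building"
+     #   combine  -> one row, category=["amenity", "building"],
+     #               category_value=["school", "school"]
+     #   primary  -> one row, the first matching category encountered
+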
+     def fetch_locations(
+         self,
+         since_year: Optional[int] = None,
+         handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
+     ) -> pd.DataFrame:
+         """
+         Fetch and process OSM locations.
+
+         Args:
+             since_year (int, optional): Filter for locations added/modified since this year.
+             handle_duplicates (str): How to handle objects matching multiple categories:
+                 - 'separate': Create separate entries for each category (default)
+                 - 'combine': Use a single entry with a list of matching categories
+                 - 'primary': Keep only the first matching category
+
+         Returns:
+             pd.DataFrame: Processed OSM locations
+         """
+         if handle_duplicates not in ("separate", "combine", "primary"):
+             raise ValueError(
+                 "handle_duplicates must be one of: 'separate', 'combine', 'primary'"
+             )
+
+         self.logger.info(
+             f"Fetching OSM locations from Overpass API for country: {self.country}"
+         )
+         self.logger.info(f"Location types: {self.location_types}")
+         self.logger.info(f"Handling duplicate category matches as: {handle_duplicates}")
+
+         # Get queries for different element types
+         nodes_relations_query, ways_query = self._build_queries(since_year)
+
+         # Fetch nodes and relations
+         nodes_relations_response = self._make_request(nodes_relations_query)
+         nodes_relations = nodes_relations_response.get("elements", [])
+
+         # Fetch ways
+         ways_response = self._make_request(ways_query)
+         ways = ways_response.get("elements", [])
+
+         if not nodes_relations and not ways:
+             self.logger.warning("No locations found for the specified criteria")
+             return pd.DataFrame()
+
+         self.logger.info(
+             f"Processing {len(nodes_relations)} nodes/relations and {len(ways)} ways..."
+         )
+
+         # Process nodes and relations
+         with ThreadPoolExecutor() as executor:
+             processed_nodes_relations = [
+                 item
+                 for sublist in executor.map(
+                     self._process_node_relation, nodes_relations
+                 )
+                 for item in sublist
+             ]
+
+         # Process ways
+         with ThreadPoolExecutor() as executor:
+             processed_ways = [
+                 item
+                 for sublist in executor.map(self._process_way, ways)
+                 for item in sublist
+             ]
+
+         # Combine all processed elements
+         all_elements = processed_nodes_relations + processed_ways
+
+         if not all_elements:
+             self.logger.warning("No matching elements found after processing")
+             return pd.DataFrame()
+
+         # Handle duplicates based on the specified strategy
+         if handle_duplicates != "separate":
+             # Group by source_id
+             grouped_elements = {}
+             for elem in all_elements:
+                 source_id = elem["source_id"]
+                 if source_id not in grouped_elements:
+                     grouped_elements[source_id] = elem
+                 elif handle_duplicates == "combine":
+                     # Combine matching categories into list-valued columns
+                     if grouped_elements[source_id]["category"] != elem["category"]:
+                         if isinstance(grouped_elements[source_id]["category"], str):
+                             grouped_elements[source_id]["category"] = [
+                                 grouped_elements[source_id]["category"]
+                             ]
+                             grouped_elements[source_id]["category_value"] = [
+                                 grouped_elements[source_id]["category_value"]
+                             ]
+
+                         if (
+                             elem["category"]
+                             not in grouped_elements[source_id]["category"]
+                         ):
+                             grouped_elements[source_id]["category"].append(
+                                 elem["category"]
+                             )
+                             grouped_elements[source_id]["category_value"].append(
+                                 elem["category_value"]
+                             )
+                 # For 'primary', just keep the first one we encountered
+
+             all_elements = list(grouped_elements.values())
+
+         locations = pd.DataFrame(all_elements)
+
+         # Log element type distribution
+         type_counts = locations["type"].value_counts()
+         self.logger.info("\nElement type distribution:")
+         for element_type, count in type_counts.items():
+             self.logger.info(f"{element_type}: {count}")
+
+         # Log category distribution
+         if handle_duplicates == "combine":
+             # Count each category separately when they're in lists
+             category_counts = {}
+             for cats in locations["category"]:
+                 if isinstance(cats, list):
+                     for cat in cats:
+                         category_counts[cat] = category_counts.get(cat, 0) + 1
+                 else:
+                     category_counts[cats] = category_counts.get(cats, 0) + 1
+         else:
+             category_counts = locations["category"].value_counts().to_dict()
+
+         self.logger.info("\nCategory distribution:")
+         for category, count in category_counts.items():
+             self.logger.info(f"{category}: {count}")
+
+         # Log elements with multiple matching categories
+         multi_category = [e for e in all_elements if len(e["matching_categories"]) > 1]
+         if multi_category:
+             self.logger.info(
+                 f"\n{len(multi_category)} elements matched multiple categories"
+             )
+
+         self.logger.info(f"Successfully processed {len(locations)} locations")
+         return locations
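
For orientation, a minimal usage sketch of the handler above (not from the package docs; the country and tag values are arbitrary examples):

    from gigaspatial.handlers.osm import OSMLocationFetcher

    # A plain list is treated as "amenity" values; a dict maps OSM tag keys
    # to accepted values (see __post_init__ above).
    fetcher = OSMLocationFetcher(
        country="TR",
        location_types={"amenity": ["school", "hospital"]},
    )
    locations = fetcher.fetch_locations(since_year=2020, handle_duplicates="combine")
    print(locations[["source_id", "category", "latitude", "longitude"]].head())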
gigaspatial/handlers/overture.py
@@ -0,0 +1,126 @@
+ import geopandas as gpd
+ from typing import List, Optional, Union
+ from pydantic.dataclasses import dataclass
+ from pydantic import ConfigDict, Field
+ from shapely.geometry import Polygon, MultiPolygon
+ from shapely.strtree import STRtree
+ from pathlib import Path
+ import pycountry
+ import duckdb
+
+ from gigaspatial.config import config
+ from gigaspatial.handlers.boundaries import AdminBoundaries
+ from gigaspatial.core.io.data_store import DataStore
+
+
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+ class OvertureAmenityFetcher:
+     """
+     A class to fetch and process amenity locations from Overture.
+     """
+
+     # constants
+     release: Optional[str] = "2024-12-18.0"
+     base_url: Optional[str] = (
+         "s3://overturemaps-us-west-2/release/{release}/theme=places/*/*"
+     )
+
+     # user config
+     country: str = Field(...)
+     amenity_types: List[str] = Field(..., description="List of amenity types to fetch")
+     geom: Optional[Union[Polygon, MultiPolygon]] = None
+
+     # config for country boundary access from data storage;
+     # if None, GADM boundaries will be used
+     data_store: Optional[DataStore] = None
+     country_geom_path: Optional[Union[str, Path]] = None
+
+     def __post_init__(self):
+         """Validate inputs and set up logging."""
+         try:
+             self.country = pycountry.countries.lookup(self.country).alpha_2
+         except LookupError:
+             raise ValueError(f"Invalid country code provided: {self.country}")
+
+         self.base_url = self.base_url.format(release=self.release)
+         self.logger = config.get_logger(self.__class__.__name__)
+
+         self.connection = self._set_connection()
+
+     def _set_connection(self):
+         """Set up the DuckDB connection with the spatial extension loaded."""
+         db = duckdb.connect()
+         db.install_extension("spatial")
+         db.load_extension("spatial")
+         return db
+
+     def _load_country_geometry(
+         self,
+     ) -> Union[Polygon, MultiPolygon]:
+         """Load country boundary geometry from DataStore or GADM."""
+
+         gdf_admin0 = AdminBoundaries.create(
+             country_code=pycountry.countries.lookup(self.country).alpha_3,
+             admin_level=0,
+             data_store=self.data_store,
+             path=self.country_geom_path,
+         ).to_geodataframe()
+
+         return gdf_admin0.geometry.iloc[0]
+
+     def _build_query(self, match_pattern: bool = False, **kwargs) -> str:
+         """Construct and return the DuckDB query."""
+
+         if match_pattern:
+             amenity_query = " OR ".join(
+                 [f"category ILIKE '%{amenity}%'" for amenity in self.amenity_types]
+             )
+         else:
+             amenity_query = " OR ".join(
+                 [f"category = '{amenity}'" for amenity in self.amenity_types]
+             )
+
+         if not self.geom:
+             self.geom = self._load_country_geometry()
+
+         # Pre-filter on the bounding box (cheap, uses the parquet bbox columns);
+         # the precise boundary filter happens later in fetch_locations.
+         minx, miny, maxx, maxy = self.geom.bounds
+
+         query = f"""
+         SELECT id,
+                names.primary AS name,
+                ROUND(confidence, 2) AS confidence,
+                categories.primary AS category,
+                ST_AsText(geometry) AS geometry
+         FROM read_parquet('{self.base_url}', hive_partitioning=1)
+         WHERE bbox.xmin > {minx}
+           AND bbox.ymin > {miny}
+           AND bbox.xmax < {maxx}
+           AND bbox.ymax < {maxy}
+           AND ({amenity_query})
+         """
+
+         return query
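+
+     # Illustrative only: with amenity_types=["school"] and match_pattern=False,
+     # the rendered query is approximately:
+     #
+     #   SELECT id, names.primary AS name, ROUND(confidence, 2) AS confidence,
+     #          categories.primary AS category, ST_AsText(geometry) AS geometry
+     #   FROM read_parquet('s3://overturemaps-us-west-2/release/2024-12-18.0/theme=places/*/*',
+     #                     hive_partitioning=1)
+     #   WHERE bbox.xmin > ... AND bbox.ymin > ... AND bbox.xmax < ... AND bbox.ymax < ...
+     #     AND (category = 'school')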
+
+     def fetch_locations(
+         self, match_pattern: bool = False, **kwargs
+     ) -> gpd.GeoDataFrame:
+         """Fetch and process amenity locations."""
+         self.logger.info("Fetching amenity locations from Overture DB...")
+
+         query = self._build_query(match_pattern=match_pattern, **kwargs)
+
+         df = self.connection.execute(query).df()
+
+         self.logger.info("Processing geometries")
+         gdf = gpd.GeoDataFrame(
+             df, geometry=gpd.GeoSeries.from_wkt(df["geometry"]), crs="EPSG:4326"
+         )
+
+         # Filter by the precise geometry boundary (the query above only
+         # filtered by bounding box)
+         s = STRtree(gdf.geometry)
+         result = s.query(self.geom, predicate="intersects")
+
+         locations = gdf.iloc[result].reset_index(drop=True)
+
+         self.logger.info(f"Successfully processed {len(locations)} amenity locations")
+         return locations
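
Again, a minimal usage sketch, assuming only the interface defined above (the country and amenity values are arbitrary):

    from gigaspatial.handlers.overture import OvertureAmenityFetcher

    fetcher = OvertureAmenityFetcher(country="KE", amenity_types=["school", "hospital"])

    # match_pattern=True switches the category filter from exact equality
    # to ILIKE substring matching (see _build_query above).
    gdf = fetcher.fetch_locations(match_pattern=True)
    print(gdf[["name", "category", "confidence"]].head())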
gigaspatial/handlers/rwi.py
@@ -0,0 +1,157 @@
+ import logging
+ from pathlib import Path
+ from typing import List, Optional, Union, Literal
+ import pycountry
+ import tempfile
+
+ from pydantic import Field, field_validator
+
+ from gigaspatial.core.io.data_store import DataStore
+ from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader
+
+
+ class RWIConfig(HDXConfig):
+     """Configuration for Relative Wealth Index data access"""
+
+     # Override dataset_name to be fixed for RWI
+     dataset_name: Literal["relative-wealth-index"] = Field(
+         default="relative-wealth-index"
+     )
+
+     # Additional RWI-specific configuration
+     country: Optional[str] = Field(
+         default=None, description="Country ISO code to filter data for"
+     )
+
+     @field_validator("country")
+     @classmethod
+     def validate_country(cls, value: Optional[str]) -> Optional[str]:
+         if value is None:
+             return value
+         try:
+             return pycountry.countries.lookup(value).alpha_3
+         except LookupError:
+             raise ValueError(f"Invalid country code provided: {value}")
+
+
+ class RelativeWealthIndexDownloader(HDXDownloader):
+     """Specialized downloader for the Relative Wealth Index dataset from HDX"""
+
+     def __init__(
+         self,
+         config: Optional[Union[RWIConfig, dict]] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         if config is None:
+             config = RWIConfig()
+         elif isinstance(config, dict):
+             config = RWIConfig(**config)
+
+         super().__init__(config=config, data_store=data_store, logger=logger)
+
+     @classmethod
+     def from_config(
+         cls,
+         country: Optional[str] = None,
+         **kwargs,
+     ):
+         """Create a downloader with RWI-specific configuration"""
+         config = RWIConfig(country=country, **kwargs)
+         return cls(config=config)
+
+     def download_dataset(self) -> List[str]:
+         """Download the RWI dataset, optionally filtering for a specific country.
+
+         If a country is specified, attempts to find and download only the resources
+         relevant to that country. Otherwise, downloads all RWI resources.
+
+         Returns:
+             List of paths to the downloaded files
+         """
+         # If no country is specified, download all resources
+         if self.config.country is None:
+             return super().download_dataset()
+
+         # Get all resources from the dataset
+         try:
+             resources = self.get_dataset_resources()
+             if not resources:
+                 self.logger.warning("No resources found for RWI dataset")
+                 return []
+
+             # Prepare country identifiers for matching
+             country_code = self.config.country.lower()
+             country_name = pycountry.countries.lookup(self.config.country).name.lower()
+             country_alpha2 = pycountry.countries.lookup(
+                 self.config.country
+             ).alpha_2.lower()
+
+             # Try different matching patterns
+             country_patterns = [
+                 f"/{country_code}_",  # URL path with ISO3 prefix
+                 f"/{country_code}.",  # URL path with ISO3 followed by extension
+                 f"_{country_code}_",  # Filename with ISO3 in the middle
+                 f"_{country_code}.",  # Filename with ISO3 at the end
+                 f"/{country_name.replace(' ', '')}_",  # URL with no spaces
+                 f"/{country_name.replace(' ', '-')}_",  # URL with hyphens
+                 f"/{country_alpha2}_",  # URL with ISO2 code
+                 country_name,  # Country name anywhere in the URL
+             ]
+
+             # Find matching resources
+             matching_resources = []
+             for resource in resources:
+                 # Get the URL safely
+                 resource_url = resource.get("url", "")
+                 if not resource_url:
+                     continue
+
+                 resource_url = resource_url.lower()
+
+                 # Check for matches with our patterns
+                 if any(pattern in resource_url for pattern in country_patterns):
+                     matching_resources.append(resource)
+
+             if not matching_resources:
+                 self.logger.warning(
+                     f"No resources matching country '{self.config.country}' were found. "
+                     f"Consider downloading the full dataset with country=None and filtering afterwards."
+                 )
+                 return []
+
+             # Download the matching resources
+             downloaded_paths = []
+             for res in matching_resources:
+                 try:
+                     resource_name = res.get("name", "Unknown")
+                     self.logger.info(f"Downloading resource: {resource_name}")
+
+                     # Download to a temporary directory
+                     with tempfile.TemporaryDirectory() as tmpdir:
+                         url, local_path = res.download(folder=tmpdir)
+                         # Read the file and write it to the DataStore
+                         with open(local_path, "rb") as f:
+                             data = f.read()
+                         # Compose the target path in the DataStore
+                         target_path = str(
+                             self.config.output_dir_path / Path(local_path).name
+                         )
+                         self.data_store.write_file(target_path, data)
+                         downloaded_paths.append(target_path)
+
+                         self.logger.info(
+                             f"Downloaded resource: {resource_name} to {target_path}"
+                         )
+
+                 except Exception as e:
+                     resource_name = res.get("name", "Unknown")
+                     self.logger.error(
+                         f"Error downloading resource {resource_name}: {str(e)}"
+                     )
+
+             return downloaded_paths
+
+         except Exception as e:
+             self.logger.error(f"Error during country-filtered download: {str(e)}")
+
+             # Fall back to downloading all resources
+             self.logger.info("Falling back to downloading all RWI resources")
+             return super().download_dataset()
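
Finally, a minimal usage sketch for this downloader (assuming only the interface above; "KEN" is an arbitrary example):

    from gigaspatial.handlers.rwi import RelativeWealthIndexDownloader

    # Country-filtered download; download_dataset returns an empty list if no
    # country-specific resource is matched, and falls back to the full
    # dataset only when an error occurs during filtering.
    downloader = RelativeWealthIndexDownloader.from_config(country="KEN")
    paths = downloader.download_dataset()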