giga-spatial 0.6.4-py3-none-any.whl → 0.6.6-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
File: gigaspatial/handlers/boundaries.py

@@ -10,7 +10,7 @@ import pycountry
 from gigaspatial.core.io.data_store import DataStore
 from gigaspatial.core.io.readers import read_dataset
 from gigaspatial.handlers.hdx import HDXConfig
-from gigaspatial.config import config
+from gigaspatial.config import config as global_config
 
 
 class AdminBoundary(BaseModel):
@@ -33,7 +33,6 @@ class AdminBoundaries(BaseModel):
     )
 
     class Config:
-        # extra = "allow"
         arbitrary_types_allowed = True
 
 
@@ -48,7 +47,7 @@ class AdminBoundaries(BaseModel):
         description="Administrative level (e.g., 0=country, 1=state, etc.)",
     )
 
-    logger: ClassVar = config.get_logger("AdminBoundaries")
+    logger: ClassVar = global_config.get_logger("AdminBoundaries")
 
     _schema_config: ClassVar[Dict[str, Dict[str, str]]] = {
         "gadm": {
@@ -292,6 +291,56 @@ class AdminBoundaries(BaseModel):
             country_code, admin_level, "geoBoundaries"
         )
 
+    @classmethod
+    def from_global_country_boundaries(cls, scale: str = "medium") -> "AdminBoundaries":
+        """
+        Load global country boundaries from Natural Earth Data.
+
+        Args:
+            scale (str): One of 'large', 'medium', 'small'.
+                - 'large' -> 10m
+                - 'medium' -> 50m
+                - 'small' -> 110m
+        Returns:
+            AdminBoundaries: All country boundaries at admin_level=0
+        """
+        scale_map = {
+            "large": "10m",
+            "medium": "50m",
+            "small": "110m",
+        }
+        if scale not in scale_map:
+            raise ValueError(
+                f"Invalid scale '{scale}'. Choose from 'large', 'medium', 'small'."
+            )
+        scale_folder = scale_map[scale]
+        url = f"https://naciscdn.org/naturalearth/{scale_folder}/cultural/ne_{scale_folder}_admin_0_countries.zip"
+        cls.logger.info(f"Loading Natural Earth global country boundaries from {url}")
+        try:
+            gdf = gpd.read_file(url)
+            # Map fields to AdminBoundary schema
+            boundaries = []
+            for _, row in gdf.iterrows():
+                iso_a3 = row.get("ISO_A3_EH") or row.get("ISO_A3") or row.get("ADM0_A3")
+                name = row.get("NAME") or row.get("ADMIN") or row.get("SOVEREIGNT")
+                geometry = row.get("geometry")
+                if not iso_a3 or not name or geometry is None:
+                    continue
+                boundary = AdminBoundary(
+                    id=iso_a3,
+                    name=name,
+                    geometry=geometry,
+                    country_code=iso_a3,
+                )
+                boundaries.append(boundary)
+            cls.logger.info(
+                f"Loaded {len(boundaries)} country boundaries from Natural Earth."
+            )
+            return cls(boundaries=boundaries, level=0)
+        except Exception as e:
+            cls.logger.error(f"Failed to load Natural Earth global boundaries: {e}")
+            raise
+
     @classmethod
     def create(
         cls,
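Usage sketch for the new `from_global_country_boundaries` classmethod — the import path and the `boundaries`/`level` attributes are taken from this diff:

    from gigaspatial.handlers.boundaries import AdminBoundaries

    # "small" maps to Natural Earth's 110m layer via scale_map above
    countries = AdminBoundaries.from_global_country_boundaries(scale="small")
    print(countries.level)            # 0 (admin level 0 = countries)
    print(len(countries.boundaries))  # one AdminBoundary per country row kept
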
@@ -301,28 +350,50 @@ class AdminBoundaries(BaseModel):
         path: Optional[Union[str, "Path"]] = None,
         **kwargs,
     ) -> "AdminBoundaries":
-        """Factory method to create AdminBoundaries instance from either GADM or data store.
+        """
+        Factory method to create an AdminBoundaries instance using various data sources,
+        depending on the provided parameters and global configuration.
+
+        Loading Logic:
+            1. If a `data_store` is provided and either a `path` is given or
+               `global_config.ADMIN_BOUNDARIES_DATA_DIR` is set:
+                - If `path` is not provided but `country_code` is, the path is constructed
+                  using `global_config.get_admin_path()`.
+                - Loads boundaries from the specified data store and path.
+
+            2. If only `country_code` is provided (no data_store):
+                - Attempts to load boundaries from GeoRepo (if available).
+                - If GeoRepo is unavailable, attempts to load from GADM.
+                - If GADM fails, falls back to geoBoundaries.
+                - Raises an error if all sources fail.
+
+            3. If neither `country_code` nor `data_store` is provided:
+                - Raises a ValueError.
 
         Args:
-            country_code: ISO country code (2 or 3 letter) or country name
-            admin_level: Administrative level (0=country, 1=state/province, etc.)
-            data_store: Optional data store instance for loading from existing data
-            path: Optional path to data file (used with data_store)
-            **kwargs: Additional arguments passed to the underlying creation methods
+            country_code (Optional[str]): ISO country code (2 or 3 letter) or country name.
+            admin_level (int): Administrative level (0=country, 1=state/province, etc.).
+            data_store (Optional[DataStore]): Optional data store instance for loading from existing data.
+            path (Optional[Union[str, Path]]): Optional path to data file (used with data_store).
+            **kwargs: Additional arguments passed to the underlying creation methods.
 
         Returns:
-            AdminBoundaries: Configured instance
+            AdminBoundaries: Configured instance.
 
         Raises:
             ValueError: If neither country_code nor (data_store, path) are provided,
-                or if country_code lookup fails
+                or if country_code lookup fails.
+            RuntimeError: If all data sources fail to load boundaries.
 
-        Example:
-            # From country code
-            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
+        Examples:
+            # Load from a data store (path auto-generated if not provided)
+            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1, data_store=store)
 
-            # From data store
+            # Load from a specific file in a data store
            boundaries = AdminBoundaries.create(data_store=store, path="data.shp")
+
+            # Load from online sources (GeoRepo, GADM, geoBoundaries)
+            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
         """
         cls.logger.info(
             f"Creating AdminBoundaries instance. Country: {country_code}, "
@@ -330,17 +401,21 @@ class AdminBoundaries(BaseModel):
             f"path provided: {path is not None}"
         )
 
+        from_data_store = data_store is not None and (
+            global_config.ADMIN_BOUNDARIES_DATA_DIR is not None or path is not None
+        )
+
         # Validate input parameters
         if not country_code and not data_store:
             raise ValueError("Either country_code or data_store must be provided.")
 
-        if data_store and not path and not country_code:
+        if from_data_store and not path and not country_code:
             raise ValueError(
                 "If data_store is provided, either path or country_code must also be specified."
             )
 
         # Handle data store path first
-        if data_store is not None:
+        if from_data_store:
             iso3_code = None
             if country_code:
                 try:
@@ -350,7 +425,7 @@ class AdminBoundaries(BaseModel):
 
             # Generate path if not provided
             if path is None and iso3_code:
-                path = config.get_admin_path(
+                path = global_config.get_admin_path(
                     country_code=iso3_code,
                     admin_level=admin_level,
                 )
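A short sketch of the two loading paths the new docstring describes. `LocalDataStore` comes from `gigaspatial.core.io.local_data_store` (the import this release removes from the GHSL module); whether it is default-constructible is an assumption here:

    from gigaspatial.handlers.boundaries import AdminBoundaries
    from gigaspatial.core.io.local_data_store import LocalDataStore

    # Online path: GeoRepo -> GADM -> geoBoundaries fallback chain
    admin1 = AdminBoundaries.create(country_code="KEN", admin_level=1)

    # Data-store path: taken only when a store is passed AND a path is given
    # or global_config.ADMIN_BOUNDARIES_DATA_DIR is configured (new in 0.6.6)
    store = LocalDataStore()  # assumed default-constructible; any DataStore works
    admin0 = AdminBoundaries.create(country_code="KEN", admin_level=0, data_store=store)
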
File: gigaspatial/handlers/ghsl.py (path inferred from the GHSL classes below)

@@ -14,7 +14,6 @@ import requests
 from tqdm import tqdm
 import zipfile
 import tempfile
-import shutil
 from pydantic import (
     HttpUrl,
     Field,
@@ -25,8 +24,6 @@ from pydantic import (
 import logging
 
 from gigaspatial.core.io.data_store import DataStore
-from gigaspatial.core.io.local_data_store import LocalDataStore
-from gigaspatial.handlers.boundaries import AdminBoundaries
 from gigaspatial.processing.tif_processor import TifProcessor
 from gigaspatial.handlers.base import (
     BaseHandlerConfig,
@@ -241,8 +238,8 @@ class GHSLDataConfig(BaseHandlerConfig):
             ValueError: If the input `source` is not one of the supported types.
         """
         if isinstance(source, gpd.GeoDataFrame):
-            # if source.crs != "EPSG:4326":
-            #     source = source.to_crs("EPSG:4326")
+            if source.crs != crs:
+                source = source.to_crs(crs)
             search_geom = source.geometry.unary_union
         elif isinstance(
             source,
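The replaced comment hard-coded EPSG:4326; the live guard now reprojects the query frame into whatever CRS the tile grid uses before the intersection test. The same pattern as a standalone sketch (names illustrative, not from the package):

    import geopandas as gpd

    def align_crs(query: gpd.GeoDataFrame, grid: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        # Intersections are only meaningful in a shared CRS; reproject the
        # (usually small) query frame rather than the whole tile grid.
        if query.crs != grid.crs:
            query = query.to_crs(grid.crs)
        return query
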
@@ -273,7 +270,9 @@ class GHSLDataConfig(BaseHandlerConfig):
             tile_geom.intersects(search_geom) for tile_geom in self.tiles_gdf.geometry
         )
 
-        return self.tiles_gdf.loc[mask, "tile_id"].to_list()
+        intersecting_tiles = self.tiles_gdf.loc[mask, "tile_id"].to_list()
+
+        return intersecting_tiles
 
     def _get_product_info(self) -> dict:
         """Generate and return common product information used in multiple methods."""
@@ -340,7 +339,7 @@ class GHSLDataDownloader(BaseHandlerDownloader):
 
         Args:
             tile_id: tile ID to process.
-            extract: If True and the downloaded file is a zip, extract its contents. Defaults to False.
+            extract: If True and the downloaded file is a zip, extract its contents. Defaults to True.
             file_pattern: Optional regex pattern to filter extracted files (if extract=True).
             **kwargs: Additional parameters passed to download methods
 
@@ -356,14 +355,34 @@ class GHSLDataDownloader(BaseHandlerDownloader):
             return self._download_file(url, output_path)
 
         extracted_files: List[Path] = []
+        temp_downloaded_path: Optional[Path] = None
 
         try:
             with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
-                downloaded_path = self._download_file(url, Path(temp_file.name))
-                if not downloaded_path:
-                    return None
+                temp_downloaded_path = Path(temp_file.name)
+                self.logger.debug(
+                    f"Downloading {url} to temporary file: {temp_downloaded_path}"
+                )
+
+                response = requests.get(url, stream=True)
+                response.raise_for_status()
+
+                total_size = int(response.headers.get("content-length", 0))
+
+                with tqdm(
+                    total=total_size,
+                    unit="B",
+                    unit_scale=True,
+                    desc=f"Downloading {tile_id}",
+                ) as pbar:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            temp_file.write(chunk)
+                            pbar.update(len(chunk))
+
+                self.logger.info(f"Successfully downloaded temporary file!")
 
-            with zipfile.ZipFile(str(downloaded_path), "r") as zip_ref:
+            with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
                 if file_pattern:
                     import re
 
@@ -385,9 +404,24 @@ class GHSLDataDownloader(BaseHandlerDownloader):
             Path(temp_file.name).unlink()
             return extracted_files
 
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Failed to download {url} to temporary file: {e}")
+            return None
+        except zipfile.BadZipFile:
+            self.logger.error(f"Downloaded file for {tile_id} is not a valid zip file.")
+            return None
         except Exception as e:
             self.logger.error(f"Error downloading/extracting tile {tile_id}: {e}")
             return None
+        finally:
+            if temp_downloaded_path and temp_downloaded_path.exists():
+                try:
+                    temp_downloaded_path.unlink()
+                    self.logger.debug(f"Deleted temporary file: {temp_downloaded_path}")
+                except OSError as e:
+                    self.logger.warning(
+                        f"Could not delete temporary file {temp_downloaded_path}: {e}"
+                    )
 
     def download_data_units(
         self,
@@ -401,7 +435,7 @@ class GHSLDataDownloader(BaseHandlerDownloader):
 
         Args:
             tile_ids: A list of tile IDs to download.
-            extract: If True and the downloaded files are zips, extract their contents. Defaults to False.
+            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
             file_pattern: Optional regex pattern to filter extracted files (if extract=True).
             **kwargs: Additional parameters passed to download methods
 
@@ -456,7 +490,7 @@ class GHSLDataDownloader(BaseHandlerDownloader):
                 - A list of (latitude, longitude) tuples or Shapely Point objects.
                 - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                 - A GeoDataFrame with geometry column in EPSG:4326.
-            extract: If True and the downloaded files are zips, extract their contents. Defaults to False.
+            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
             file_pattern: Optional regex pattern to filter extracted files (if extract=True).
             **kwargs: Additional keyword arguments. These will be passed down to
                 `AdminBoundaries.create()` (if `source` is a country)
@@ -496,7 +530,7 @@ class GHSLDataDownloader(BaseHandlerDownloader):
             country_geom_path: Optional path to a GeoJSON file containing the
                 country boundary. If provided, this boundary is used
                 instead of the default from `AdminBoundaries`.
-            extract: If True and the downloaded files are zips, extract their contents. Defaults to False.
+            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
             file_pattern: Optional regex pattern to filter extracted files (if extract=True).
             **kwargs: Additional keyword arguments that are passed to
                 `download_data_units`. For example, `extract` to download and extract.
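These three docstring updates track one behavior change: `extract` now defaults to True across the downloader API, so zip archives are unpacked unless callers opt out. A hedged sketch (the `gigaspatial.handlers.ghsl` module path, config fields, and tile ID are illustrative assumptions):

    from gigaspatial.handlers.ghsl import GHSLDataConfig, GHSLDataDownloader

    config = GHSLDataConfig(product="GHS_BUILT_S", year=2020, resolution=100)
    downloader = GHSLDataDownloader(config)

    # Zips are extracted by default now; pass extract=False to keep the archives
    tifs = downloader.download_data_units(["R3_C19"], file_pattern=r".*\.tif$")
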
@@ -563,7 +597,7 @@ class GHSLDataReader(BaseHandlerReader):
         logger: Optional[logging.Logger] = None,
     ):
         """
-        Initialize the downloader.
+        Initialize the reader.
 
         Args:
             config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
@@ -770,3 +804,46 @@ class GHSLDataHandler(BaseHandler):
         return pd.concat(
             [tp.to_dataframe() for tp in tif_processors], ignore_index=True
         )
+
+    def load_into_geodataframe(
+        self,
+        source: Union[
+            str,  # country
+            List[Union[tuple, Point]],  # points
+            BaseGeometry,  # geometry
+            gpd.GeoDataFrame,  # geodataframe
+            Path,  # path
+            List[Union[str, Path]],  # list of paths
+        ],
+        ensure_available: bool = True,
+        **kwargs,
+    ) -> gpd.GeoDataFrame:
+        """
+        Load GHSL data into a geopandas GeoDataFrame.
+
+        Args:
+            source: The data source specification
+            ensure_available: If True, ensure data is downloaded before loading
+            **kwargs: Additional parameters passed to load methods
+
+        Returns:
+            GeoDataFrame containing the GHSL data
+        """
+        tif_processors = self.load_data(
+            source=source, ensure_available=ensure_available, **kwargs
+        )
+        return pd.concat(
+            [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
+        )
+
+    def get_available_data_info(
+        self,
+        source: Union[
+            str,  # country
+            List[Union[tuple, Point]],  # points
+            BaseGeometry,  # geometry
+            gpd.GeoDataFrame,  # geodataframe
+        ],
+        **kwargs,
+    ) -> dict:
+        return super().get_available_data_info(source, file_ext=".tif", **kwargs)
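Usage sketch for the new `load_into_geodataframe` method; the method signature is from this diff, while handler construction arguments are assumptions:

    handler = GHSLDataHandler(product="GHS_POP", year=2020, resolution=100)  # illustrative args
    gdf = handler.load_into_geodataframe("KEN", ensure_available=True)
    print(gdf.shape, gdf.geometry.name)
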
File: gigaspatial/handlers/rwi.py (path inferred from the RWIConfig class below)

@@ -2,6 +2,7 @@ import logging
 from typing import List, Optional, Union, Literal
 from pydantic.dataclasses import dataclass
 from datetime import datetime
+import pycountry
 
 from hdx.data.resource import Resource
 
@@ -36,8 +37,10 @@ class RWIConfig(HDXConfig):
         self, country: str, **kwargs
     ) -> List[Resource]:
         """Get relevant data units for a country, optionally filtering for latest version"""
-        resources = super().get_relevant_data_units_by_country(
-            country=country, key="url"
+        country = pycountry.countries.lookup(country)
+        values = [country.alpha_3]
+        resources = self.get_dataset_resources(
+            filter={"url": values},
         )
 
         if self.latest_only and len(resources) > 1:
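The rewritten lookup normalizes any accepted country spelling to its ISO alpha-3 code before filtering dataset resources by URL. pycountry's `lookup` accepts names, alpha-2, or alpha-3 codes, case-insensitively:

    import pycountry

    country = pycountry.countries.lookup("Kenya")  # also accepts "KE" or "KEN"
    print(country.alpha_3)                         # -> "KEN"
    # RWIConfig then keeps only resources whose URL matches the alpha-3 code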