giga_spatial-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
gigaspatial/handlers/hdx.py
@@ -0,0 +1,241 @@
+ import os
+ import logging
+ from pathlib import Path
+ from typing import List, Optional, Union, Dict, Any
+ import tempfile
+
+ import pandas as pd
+ import geopandas as gpd
+ from pydantic import BaseModel, Field
+
+ from hdx.api.configuration import Configuration
+ from hdx.data.dataset import Dataset
+ from hdx.data.resource import Resource
+
+ from gigaspatial.core.io.data_store import DataStore
+ from gigaspatial.core.io.local_data_store import LocalDataStore
+ from gigaspatial.core.io.readers import read_dataset
+ from gigaspatial.config import config as global_config
+
+
+ class HDXConfig(BaseModel):
+     """Configuration for HDX data access"""
+
+     # User configuration
+     dataset_name: str = Field(..., description="Name of the HDX dataset to download")
+     base_path: Path = Field(default=global_config.get_path("hdx", "bronze"))
+     user_agent: str = Field(
+         default="gigaspatial", description="User agent for HDX API requests"
+     )
+     hdx_site: str = Field(default="prod", description="HDX site to use (prod or test)")
+     resource_filter: Optional[Dict[str, Any]] = Field(
+         default=None, description="Filter to apply to resources"
+     )
+
+     @property
+     def output_dir_path(self) -> Path:
+         """Path to save the downloaded HDX dataset"""
+         return self.base_path / self.dataset_name
+
+     def __repr__(self) -> str:
+         return (
+             f"HDXConfig(\n"
+             f" dataset_name='{self.dataset_name}'\n"
+             f" base_path='{self.base_path}'\n"
+             f" hdx_site='{self.hdx_site}'\n"
+             f" user_agent='{self.user_agent}'\n"
+             f")"
+         )
+
+
+ class HDXDownloader:
+     """Downloader for HDX datasets"""
+
+     def __init__(
+         self,
+         config: Union[HDXConfig, dict],
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         if isinstance(config, dict):
+             self.config = HDXConfig(**config)
+         else:
+             self.config = config
+
+         self.data_store = data_store or LocalDataStore()
+         self.logger = logger or global_config.get_logger(self.__class__.__name__)
+         # Reuse an existing HDX configuration if one has already been created
+         try:
+             Configuration.read()
+             self._hdx_configured = True
+         except Exception:
+             self._hdx_configured = False
+
+     @classmethod
+     def from_dataset_name(cls, dataset_name: str, **kwargs):
+         """Create a downloader for a specific HDX dataset"""
+         config = HDXConfig(dataset_name=dataset_name, **kwargs)
+         return cls(config=config)
+
+     def _configure_hdx(self):
+         """Configure the HDX API if it is not already configured"""
+         if not self._hdx_configured:
+             try:
+                 Configuration.create(
+                     hdx_site=self.config.hdx_site,
+                     user_agent=self.config.user_agent,
+                     hdx_read_only=True,
+                 )
+                 self._hdx_configured = True
+             except Exception as e:
+                 self.logger.error(f"Error configuring HDX API: {str(e)}")
+                 raise
+
+     def get_dataset(self) -> Dataset:
+         """Get the HDX dataset"""
+         self._configure_hdx()
+
+         try:
+             self.logger.info(f"Fetching HDX dataset: {self.config.dataset_name}")
+             dataset = Dataset.read_from_hdx(self.config.dataset_name)
+             if not dataset:
+                 raise ValueError(
+                     f"Dataset '{self.config.dataset_name}' not found on HDX"
+                 )
+             return dataset
+         except Exception as e:
+             self.logger.error(f"Error fetching HDX dataset: {str(e)}")
+             raise
+
+     def get_dataset_resources(
+         self, dataset: Optional[Dataset] = None
+     ) -> List[Resource]:
+         """Get resources from the HDX dataset"""
+         dataset = dataset or self.get_dataset()
+
+         try:
+             resources = dataset.get_resources()
+
+             # Apply the resource filter if specified: a resource matches only if
+             # every key/value pair in the filter agrees with its metadata
+             if self.config.resource_filter:
+                 filtered_resources = []
+                 for res in resources:
+                     match = True
+                     for key, value in self.config.resource_filter.items():
+                         if key in res.data and res.data[key] != value:
+                             match = False
+                             break
+                     if match:
+                         filtered_resources.append(res)
+                 resources = filtered_resources
+
+             return resources
+         except Exception as e:
+             self.logger.error(f"Error getting dataset resources: {str(e)}")
+             raise
+
+     def download_dataset(self) -> List[str]:
+         """Download and save all resources from the HDX dataset into the data_store."""
+         try:
+             dataset = self.get_dataset()
+             resources = self.get_dataset_resources(dataset)
+
+             if not resources:
+                 self.logger.warning(
+                     f"No resources found for dataset '{self.config.dataset_name}'"
+                 )
+                 return []
+
+             self.logger.info(
+                 f"Found {len(resources)} resource(s) for dataset '{self.config.dataset_name}'"
+             )
+
+             downloaded_paths = []
+             for res in resources:
+                 try:
+                     resource_name = res.get("name", "Unknown")
+                     self.logger.info(f"Downloading resource: {resource_name}")
+
+                     # Download to a temporary directory
+                     with tempfile.TemporaryDirectory() as tmpdir:
+                         url, local_path = res.download(folder=tmpdir)
+                         # Read the file and write it to the DataStore
+                         with open(local_path, "rb") as f:
+                             data = f.read()
+                         # Compose the target path in the DataStore
+                         target_path = str(
+                             self.config.output_dir_path / Path(local_path).name
+                         )
+                         self.data_store.write_file(target_path, data)
+                         downloaded_paths.append(target_path)
+
+                     self.logger.info(
+                         f"Downloaded resource: {resource_name} to {target_path}"
+                     )
+                 except Exception as e:
+                     self.logger.error(
+                         f"Error downloading resource {res.get('name', 'Unknown')}: {str(e)}"
+                     )
+
+             return downloaded_paths
+
+         except Exception as e:
+             self.logger.error(f"Error downloading dataset: {str(e)}")
+             raise
+
+
+ class HDXReader:
+     """Reader for downloaded HDX datasets"""
+
+     def __init__(
+         self,
+         dataset_name: str,
+         data_store: Optional[DataStore] = None,
+         base_path: Optional[Path] = None,
+     ):
+         self.dataset_name = dataset_name
+         self.data_store = data_store or LocalDataStore()
+         self.base_path = base_path or global_config.get_path("hdx", "bronze")
+         self.dataset_path = self.base_path / self.dataset_name
+
+     def list_resources(self) -> List[str]:
+         """List all resources in the dataset directory using the data_store."""
+         # Check that the dataset directory exists in the data_store
+         if not (
+             self.data_store.is_dir(str(self.dataset_path))
+             or self.data_store.file_exists(str(self.dataset_path))
+         ):
+             raise FileNotFoundError(
+                 f"HDX dataset '{self.dataset_name}' not found at {self.dataset_path}. "
+                 "Download the data first using HDXDownloader."
+             )
+         # List files using the data_store
+         return self.data_store.list_files(str(self.dataset_path))
+
+     def read_resource(
+         self, resource_file: str
+     ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+         """Read a specific resource file from the dataset using the data_store."""
+         file_path = str(self.dataset_path / resource_file)
+
+         if not self.data_store.file_exists(file_path):
+             raise FileNotFoundError(
+                 f"Resource file {resource_file} not found in dataset {self.dataset_name}"
+             )
+
+         try:
+             return read_dataset(self.data_store, file_path)
+         except Exception as e:
+             raise ValueError(f"Could not read file {file_path}: {str(e)}") from e
+
+     def read_all_resources(self) -> Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]:
+         """Read all resources in the dataset directory using the data_store."""
+         resources = self.list_resources()
+         result = {}
+
+         for resource in resources:
+             try:
+                 result[resource] = self.read_resource(resource)
+             except Exception as e:
+                 logging.warning(f"Could not read resource {resource}: {str(e)}")
+
+         return result
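
The two classes above split the HDX workflow into a download step (HDXDownloader writes raw resources into a DataStore under the configured base path) and a read step (HDXReader loads them back as DataFrames). A minimal usage sketch follows; the dataset name and resource filter are illustrative, not taken from the package:

    from gigaspatial.handlers.hdx import HDXDownloader, HDXReader

    # Hypothetical HDX dataset slug; any public dataset name works here.
    downloader = HDXDownloader.from_dataset_name(
        "some-hdx-dataset",
        resource_filter={"format": "CSV"},  # optional; matched against resource metadata
    )
    paths = downloader.download_dataset()  # paths written into the data store

    reader = HDXReader(dataset_name="some-hdx-dataset")
    frames = reader.read_all_resources()  # {filename: DataFrame or GeoDataFrame}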
gigaspatial/handlers/mapbox_image.py
@@ -0,0 +1,208 @@
+ from typing import Iterable, Optional, Tuple, List, Union, Any
+ import requests
+ from pathlib import Path
+ import mercantile
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+ import geopandas as gpd
+ import pandas as pd
+
+ from gigaspatial.grid.mercator_tiles import MercatorTiles
+ from gigaspatial.processing.geo import convert_to_geodataframe, buffer_geodataframe
+ from gigaspatial.core.io.data_store import DataStore
+ from gigaspatial.core.io.local_data_store import LocalDataStore
+ from gigaspatial.config import config
+
+
+ class MapboxImageDownloader:
+     """Download images from the Mapbox Static Images API using a specific style"""
+
+     BASE_URL = "https://api.mapbox.com/styles/v1"
+
+     def __init__(
+         self,
+         access_token: str = config.MAPBOX_ACCESS_TOKEN,
+         style_id: Optional[str] = None,
+         data_store: Optional[DataStore] = None,
+     ):
+         """
+         Initialize the downloader with Mapbox credentials
+
+         Args:
+             access_token: Mapbox access token
+             style_id: Mapbox style ID to use for image download
+             data_store: Instance of DataStore for accessing data storage
+         """
+         self.access_token = access_token
+         self.style_id = style_id if style_id else "mapbox/satellite-v9"
+         self.data_store = data_store or LocalDataStore()
+         self.logger = config.get_logger(self.__class__.__name__)
+
+     def _construct_url(self, bounds: Iterable[float], image_size: str) -> str:
+         """Construct the Mapbox Static Images API URL"""
+         bounds_str = f"[{','.join(map(str, bounds))}]"
+
+         return (
+             f"{self.BASE_URL}/{self.style_id}/static/{bounds_str}/{image_size}"
+             f"?access_token={self.access_token}&attribution=false&logo=false"
+         )
+
+     def _download_single_image(self, url: str, output_path: Path) -> bool:
+         """Download a single image from a URL"""
+         try:
+             response = requests.get(url)
+             response.raise_for_status()
+
+             with self.data_store.open(str(output_path), "wb") as f:
+                 f.write(response.content)
+             return True
+         except Exception as e:
+             self.logger.warning(f"Error downloading {output_path.name}: {str(e)}")
+             return False
+
+     def download_images_by_tiles(
+         self,
+         mercator_tiles: "MercatorTiles",
+         output_dir: Union[str, Path],
+         image_size: Tuple[int, int] = (512, 512),
+         max_workers: int = 4,
+         image_prefix: str = "image_",
+     ) -> None:
+         """
+         Download images for the given Mercator tiles using the specified style
+
+         Args:
+             mercator_tiles: MercatorTiles instance containing quadkeys
+             output_dir: Directory to save images
+             image_size: Tuple of (width, height) for output images
+             max_workers: Maximum number of concurrent downloads
+             image_prefix: Prefix for output image names
+         """
+         output_dir = Path(output_dir)
+         # self.data_store.makedirs(str(output_dir), exist_ok=True)
+
+         image_size_str = f"{image_size[0]}x{image_size[1]}"
+         total_tiles = len(mercator_tiles.quadkeys)
+
+         self.logger.info(
+             f"Downloading {total_tiles} tiles with size {image_size_str}..."
+         )
+
+         def _get_tile_bounds(quadkey: str) -> List[float]:
+             """Get tile bounds from a quadkey"""
+             tile = mercantile.quadkey_to_tile(quadkey)
+             bounds = mercantile.bounds(tile)
+             return [bounds.west, bounds.south, bounds.east, bounds.north]
+
+         def download_image(quadkey: str) -> bool:
+             bounds = _get_tile_bounds(quadkey)
+             file_name = f"{image_prefix}{quadkey}.png"
+
+             url = self._construct_url(bounds, image_size_str)
+             success = self._download_single_image(url, output_dir / file_name)
+
+             return success
+
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             futures = [
+                 executor.submit(download_image, quadkey)
+                 for quadkey in mercator_tiles.quadkeys
+             ]
+
+             successful_downloads = 0
+             with tqdm(total=total_tiles) as pbar:
+                 for future in as_completed(futures):
+                     if future.result():
+                         successful_downloads += 1
+                     pbar.update(1)
+
+         self.logger.info(
+             f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
+         )
+
+     def download_images_by_bounds(
+         self,
+         gdf: gpd.GeoDataFrame,
+         output_dir: Union[str, Path],
+         image_size: Tuple[int, int] = (512, 512),
+         max_workers: int = 4,
+         image_prefix: str = "image_",
+     ) -> None:
+         """
+         Download images for the given bounding box geometries using the specified style
+
+         Args:
+             gdf: GeoDataFrame containing bounding box polygons
+             output_dir: Directory to save images
+             image_size: Tuple of (width, height) for output images
+             max_workers: Maximum number of concurrent downloads
+             image_prefix: Prefix for output image names
+         """
+         output_dir = Path(output_dir)
+         # self.data_store.makedirs(str(output_dir), exist_ok=True)
+
+         image_size_str = f"{image_size[0]}x{image_size[1]}"
+         total_images = len(gdf)
+
+         self.logger.info(
+             f"Downloading {total_images} images with size {image_size_str}..."
+         )
+
+         def download_image(idx: Any, bounds: Tuple[float, float, float, float]) -> bool:
+             file_name = f"{image_prefix}{idx}.png"
+             url = self._construct_url(bounds, image_size_str)
+             success = self._download_single_image(url, output_dir / file_name)
+             return success
+
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             futures = [
+                 executor.submit(download_image, row.Index, row.geometry.bounds)
+                 for row in gdf.itertuples()
+             ]
+
+             successful_downloads = 0
+             with tqdm(total=total_images) as pbar:
+                 for future in as_completed(futures):
+                     if future.result():
+                         successful_downloads += 1
+                     pbar.update(1)
+
+         self.logger.info(
+             f"Successfully downloaded {successful_downloads}/{total_images} images!"
+         )
+
+     def download_images_by_coordinates(
+         self,
+         data: Union[pd.DataFrame, List[Tuple[float, float]]],
+         res_meters_pixel: float,
+         output_dir: Union[str, Path],
+         image_size: Tuple[int, int] = (512, 512),
+         max_workers: int = 4,
+         image_prefix: str = "image_",
+     ) -> None:
+         """
+         Download images for given coordinates by creating bounding boxes around the points
+
+         Args:
+             data: A DataFrame with latitude/longitude columns (or a geometry column), or a list of (lat, lon) tuples
+             res_meters_pixel: Side length of the square bounding box in meters
+             output_dir: Directory to save images
+             image_size: Tuple of (width, height) for output images
+             max_workers: Maximum number of concurrent downloads
+             image_prefix: Prefix for output image names
+         """
+
+         if isinstance(data, pd.DataFrame):
+             coordinates_df = data
+         else:
+             coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])
+
+         gdf = convert_to_geodataframe(coordinates_df)
+
+         # Buffer each point by half the box size with square caps, yielding a
+         # res_meters_pixel x res_meters_pixel bounding box around it
+         buffered_gdf = buffer_geodataframe(
+             gdf, res_meters_pixel / 2, cap_style="square"
+         )
+
+         self.download_images_by_bounds(
+             buffered_gdf, output_dir, image_size, max_workers, image_prefix
+         )
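
A minimal sketch of driving the downloader from raw coordinates, assuming a valid token is available via config.MAPBOX_ACCESS_TOKEN; the coordinates and output directory below are illustrative:

    from gigaspatial.handlers.mapbox_image import MapboxImageDownloader

    downloader = MapboxImageDownloader()  # defaults to the mapbox/satellite-v9 style
    downloader.download_images_by_coordinates(
        data=[(6.5244, 3.3792), (9.0579, 7.4951)],  # (lat, lon) pairs
        res_meters_pixel=300,  # 300 m square box around each point
        output_dir="mapbox_images",
        image_size=(512, 512),
    )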