giga_spatial-0.6.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- giga_spatial-0.6.0.dist-info/METADATA +141 -0
- giga_spatial-0.6.0.dist-info/RECORD +47 -0
- giga_spatial-0.6.0.dist-info/WHEEL +5 -0
- giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
- giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
- gigaspatial/__init__.py +1 -0
- gigaspatial/config.py +226 -0
- gigaspatial/core/__init__.py +0 -0
- gigaspatial/core/io/__init__.py +5 -0
- gigaspatial/core/io/adls_data_store.py +325 -0
- gigaspatial/core/io/data_api.py +113 -0
- gigaspatial/core/io/data_store.py +147 -0
- gigaspatial/core/io/local_data_store.py +92 -0
- gigaspatial/core/io/readers.py +265 -0
- gigaspatial/core/io/writers.py +128 -0
- gigaspatial/core/schemas/__init__.py +0 -0
- gigaspatial/core/schemas/entity.py +244 -0
- gigaspatial/generators/__init__.py +2 -0
- gigaspatial/generators/poi.py +636 -0
- gigaspatial/generators/zonal/__init__.py +3 -0
- gigaspatial/generators/zonal/base.py +370 -0
- gigaspatial/generators/zonal/geometry.py +439 -0
- gigaspatial/generators/zonal/mercator.py +78 -0
- gigaspatial/grid/__init__.py +1 -0
- gigaspatial/grid/mercator_tiles.py +286 -0
- gigaspatial/handlers/__init__.py +40 -0
- gigaspatial/handlers/base.py +761 -0
- gigaspatial/handlers/boundaries.py +305 -0
- gigaspatial/handlers/ghsl.py +772 -0
- gigaspatial/handlers/giga.py +145 -0
- gigaspatial/handlers/google_open_buildings.py +472 -0
- gigaspatial/handlers/hdx.py +241 -0
- gigaspatial/handlers/mapbox_image.py +208 -0
- gigaspatial/handlers/maxar_image.py +291 -0
- gigaspatial/handlers/microsoft_global_buildings.py +548 -0
- gigaspatial/handlers/ookla_speedtest.py +199 -0
- gigaspatial/handlers/opencellid.py +290 -0
- gigaspatial/handlers/osm.py +356 -0
- gigaspatial/handlers/overture.py +126 -0
- gigaspatial/handlers/rwi.py +157 -0
- gigaspatial/handlers/unicef_georepo.py +806 -0
- gigaspatial/handlers/worldpop.py +266 -0
- gigaspatial/processing/__init__.py +4 -0
- gigaspatial/processing/geo.py +1054 -0
- gigaspatial/processing/sat_images.py +39 -0
- gigaspatial/processing/tif_processor.py +477 -0
- gigaspatial/processing/utils.py +49 -0
gigaspatial/handlers/hdx.py
@@ -0,0 +1,241 @@
+import os
+import logging
+from pathlib import Path
+from typing import List, Optional, Union, Dict, Any
+import tempfile
+
+import pandas as pd
+import geopandas as gpd
+from pydantic import BaseModel, Field
+
+from hdx.api.configuration import Configuration
+from hdx.data.dataset import Dataset
+from hdx.data.resource import Resource
+
+from gigaspatial.core.io.data_store import DataStore
+from gigaspatial.core.io.local_data_store import LocalDataStore
+from gigaspatial.core.io.readers import read_dataset
+from gigaspatial.config import config as global_config
+
+
+class HDXConfig(BaseModel):
+    """Configuration for HDX data access"""
+
+    # User configuration
+    dataset_name: str = Field(..., description="Name of the HDX dataset to download")
+    base_path: Path = Field(default=global_config.get_path("hdx", "bronze"))
+    user_agent: str = Field(
+        default="gigaspatial", description="User agent for HDX API requests"
+    )
+    hdx_site: str = Field(default="prod", description="HDX site to use (prod or test)")
+    resource_filter: Optional[Dict[str, Any]] = Field(
+        default=None, description="Filter to apply to resources"
+    )
+
+    @property
+    def output_dir_path(self) -> Path:
+        """Path to save the downloaded HDX dataset"""
+        return self.base_path / self.dataset_name
+
+    def __repr__(self) -> str:
+        return (
+            f"HDXConfig(\n"
+            f"  dataset_name='{self.dataset_name}'\n"
+            f"  base_path='{self.base_path}'\n"
+            f"  hdx_site='{self.hdx_site}'\n"
+            f"  user_agent='{self.user_agent}'\n"
+            f")"
+        )
+
+
+class HDXDownloader:
+    """Downloader for HDX datasets"""
+
+    def __init__(
+        self,
+        config: Union[HDXConfig, dict],
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+    ):
+        if isinstance(config, dict):
+            self.config = HDXConfig(**config)
+        else:
+            self.config = config
+
+        self.data_store = data_store or LocalDataStore()
+        self.logger = logger or global_config.get_logger(self.__class__.__name__)
+        try:
+            Configuration.read()
+            self._hdx_configured = True
+        except Exception:
+            self._hdx_configured = False
+
+    @classmethod
+    def from_dataset_name(cls, dataset_name: str, **kwargs):
+        """Create a downloader for a specific HDX dataset"""
+        config = HDXConfig(dataset_name=dataset_name, **kwargs)
+        return cls(config=config)
+
+    def _configure_hdx(self):
+        """Configure HDX API if not already configured"""
+        if not self._hdx_configured:
+            try:
+                Configuration.create(
+                    hdx_site=self.config.hdx_site,
+                    user_agent=self.config.user_agent,
+                    hdx_read_only=True,
+                )
+                self._hdx_configured = True
+            except Exception as e:
+                self.logger.error(f"Error configuring HDX API: {str(e)}")
+                raise
+
+    def get_dataset(self) -> Dataset:
+        """Get the HDX dataset"""
+        self._configure_hdx()
+
+        try:
+            self.logger.info(f"Fetching HDX dataset: {self.config.dataset_name}")
+            dataset = Dataset.read_from_hdx(self.config.dataset_name)
+            if not dataset:
+                raise ValueError(
+                    f"Dataset '{self.config.dataset_name}' not found on HDX"
+                )
+            return dataset
+        except Exception as e:
+            self.logger.error(f"Error fetching HDX dataset: {str(e)}")
+            raise
+
+    def get_dataset_resources(
+        self, dataset: Optional[Dataset] = None
+    ) -> List[Resource]:
+        """Get resources from the HDX dataset"""
+        dataset = dataset or self.get_dataset()
+
+        try:
+            resources = dataset.get_resources()
+
+            # Apply resource filter if specified
+            if self.config.resource_filter:
+                filtered_resources = []
+                for res in resources:
+                    match = True
+                    for key, value in self.config.resource_filter.items():
+                        if key in res.data and res.data[key] != value:
+                            match = False
+                            break
+                    if match:
+                        filtered_resources.append(res)
+                resources = filtered_resources
+
+            return resources
+        except Exception as e:
+            self.logger.error(f"Error getting dataset resources: {str(e)}")
+            raise
+
+    def download_dataset(self) -> List[str]:
+        """Download and save all resources from the HDX dataset into the data_store."""
+        try:
+            dataset = self.get_dataset()
+            resources = self.get_dataset_resources(dataset)
+
+            if not resources:
+                self.logger.warning(
+                    f"No resources found for dataset '{self.config.dataset_name}'"
+                )
+                return []
+
+            self.logger.info(
+                f"Found {len(resources)} resource(s) for dataset '{self.config.dataset_name}'"
+            )
+
+            downloaded_paths = []
+            for res in resources:
+                try:
+                    resource_name = res.get("name", "Unknown")
+                    self.logger.info(f"Downloading resource: {resource_name}")
+
+                    # Download to a temporary directory
+                    with tempfile.TemporaryDirectory() as tmpdir:
+                        url, local_path = res.download(folder=tmpdir)
+                        # Read the file and write to the DataStore
+                        with open(local_path, "rb") as f:
+                            data = f.read()
+                        # Compose the target path in the DataStore
+                        target_path = str(
+                            self.config.output_dir_path / Path(local_path).name
+                        )
+                        self.data_store.write_file(target_path, data)
+                        downloaded_paths.append(target_path)
+
+                        self.logger.info(
+                            f"Downloaded resource: {resource_name} to {target_path}"
+                        )
+                except Exception as e:
+                    self.logger.error(
+                        f"Error downloading resource {res.get('name', 'Unknown')}: {str(e)}"
+                    )
+
+            return downloaded_paths
+
+        except Exception as e:
+            self.logger.error(f"Error downloading dataset: {str(e)}")
+            raise
+
+
+class HDXReader:
+    """Reader for HDX datasets"""
+
+    def __init__(
+        self,
+        dataset_name: str,
+        data_store: Optional[DataStore] = None,
+        base_path: Optional[Path] = None,
+    ):
+        self.dataset_name = dataset_name
+        self.data_store = data_store or LocalDataStore()
+        self.base_path = base_path or global_config.get_path("hdx", "bronze")
+        self.dataset_path = self.base_path / self.dataset_name
+
+    def list_resources(self) -> List[str]:
+        """List all resources in the dataset directory using the data_store."""
+        # Check if the dataset directory exists in the data_store
+        if not (
+            self.data_store.is_dir(str(self.dataset_path))
+            or self.data_store.file_exists(str(self.dataset_path))
+        ):
+            raise FileNotFoundError(
+                f"HDX dataset '{self.dataset_name}' not found at {self.dataset_path}. "
+                "Download the data first using HDXDownloader."
+            )
+        # List files using the data_store
+        return self.data_store.list_files(str(self.dataset_path))
+
+    def read_resource(
+        self, resource_file: str
+    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+        """Read a specific resource file from the dataset using the data_store."""
+        file_path = str(self.dataset_path / resource_file)
+
+        if not self.data_store.file_exists(file_path):
+            raise FileNotFoundError(
+                f"Resource file {resource_file} not found in dataset {self.dataset_name}"
+            )
+
+        try:
+            return read_dataset(self.data_store, file_path)
+        except Exception as e:
+            raise ValueError(f"Could not read file {file_path}: {str(e)}")
+
+    def read_all_resources(self) -> Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]:
+        """Read all resources in the dataset directory using the data_store."""
+        resources = self.list_resources()
+        result = {}
+
+        for resource in resources:
+            try:
+                result[resource] = self.read_resource(resource)
+            except Exception as e:
+                logging.warning(f"Could not read resource {resource}: {str(e)}")
+
+        return result
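For orientation, a minimal usage sketch of the two classes above (an editorial addition, not part of the packaged code). The dataset slug "sample-hdx-dataset" is a placeholder for any dataset name that exists on HDX, and the "format" filter key is an assumption about the HDX resource metadata that resource_filter is matched against.

    from gigaspatial.handlers.hdx import HDXDownloader, HDXReader

    # Download every matching resource of a dataset into the configured
    # "bronze" path; resource_filter is optional.
    downloader = HDXDownloader.from_dataset_name(
        "sample-hdx-dataset",  # placeholder slug
        resource_filter={"format": "CSV"},  # assumed metadata key
    )
    saved_paths = downloader.download_dataset()

    # Read the downloaded resources back as DataFrames / GeoDataFrames.
    reader = HDXReader(dataset_name="sample-hdx-dataset")
    frames = reader.read_all_resources()  # {filename: (Geo)DataFrame}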
gigaspatial/handlers/mapbox_image.py
@@ -0,0 +1,208 @@
+from typing import Iterable, Optional, Tuple, List, Union, Any
+import requests
+from pathlib import Path
+import mercantile
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+import geopandas as gpd
+import pandas as pd
+
+from gigaspatial.grid.mercator_tiles import MercatorTiles
+from gigaspatial.processing.geo import convert_to_geodataframe, buffer_geodataframe
+from gigaspatial.core.io.data_store import DataStore
+from gigaspatial.core.io.local_data_store import LocalDataStore
+from gigaspatial.config import config
+
+
+class MapboxImageDownloader:
+    """Class to download images from the Mapbox Static Images API using a specific style"""
+
+    BASE_URL = "https://api.mapbox.com/styles/v1"
+
+    def __init__(
+        self,
+        access_token: str = config.MAPBOX_ACCESS_TOKEN,
+        style_id: Optional[str] = None,
+        data_store: Optional[DataStore] = None,
+    ):
+        """
+        Initialize the downloader with Mapbox credentials
+
+        Args:
+            access_token: Mapbox access token
+            style_id: Mapbox style ID to use for image download
+            data_store: Instance of DataStore for accessing data storage
+        """
+        self.access_token = access_token
+        self.style_id = style_id if style_id else "mapbox/satellite-v9"
+        self.data_store = data_store or LocalDataStore()
+        self.logger = config.get_logger(self.__class__.__name__)
+
+    def _construct_url(self, bounds: Iterable[float], image_size: str) -> str:
+        """Construct the Mapbox Static Images API URL"""
+        bounds_str = f"[{','.join(map(str, bounds))}]"
+
+        return (
+            f"{self.BASE_URL}/{self.style_id}/static/{bounds_str}/{image_size}"
+            f"?access_token={self.access_token}&attribution=false&logo=false"
+        )
+
+    def _download_single_image(self, url: str, output_path: Path) -> bool:
+        """Download a single image from URL"""
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+
+            with self.data_store.open(str(output_path), "wb") as f:
+                f.write(response.content)
+            return True
+        except Exception as e:
+            self.logger.warning(f"Error downloading {output_path.name}: {str(e)}")
+            return False
+
+    def download_images_by_tiles(
+        self,
+        mercator_tiles: "MercatorTiles",
+        output_dir: Union[str, Path],
+        image_size: Tuple[int, int] = (512, 512),
+        max_workers: int = 4,
+        image_prefix: str = "image_",
+    ) -> None:
+        """
+        Download images for given mercator tiles using the specified style
+
+        Args:
+            mercator_tiles: MercatorTiles instance containing quadkeys
+            output_dir: Directory to save images
+            image_size: Tuple of (width, height) for output images
+            max_workers: Maximum number of concurrent downloads
+            image_prefix: Prefix for output image names
+        """
+        output_dir = Path(output_dir)
+        # self.data_store.makedirs(str(output_dir), exist_ok=True)
+
+        image_size_str = f"{image_size[0]}x{image_size[1]}"
+        total_tiles = len(mercator_tiles.quadkeys)
+
+        self.logger.info(
+            f"Downloading {total_tiles} tiles with size {image_size_str}..."
+        )
+
+        def _get_tile_bounds(quadkey: str) -> List[float]:
+            """Get tile bounds from quadkey"""
+            tile = mercantile.quadkey_to_tile(quadkey)
+            bounds = mercantile.bounds(tile)
+            return [bounds.west, bounds.south, bounds.east, bounds.north]
+
+        def download_image(quadkey: str) -> bool:
+            bounds = _get_tile_bounds(quadkey)
+            file_name = f"{image_prefix}{quadkey}.png"
+
+            url = self._construct_url(bounds, image_size_str)
+            success = self._download_single_image(url, output_dir / file_name)
+
+            return success
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [
+                executor.submit(download_image, quadkey)
+                for quadkey in mercator_tiles.quadkeys
+            ]
+
+            successful_downloads = 0
+            with tqdm(total=total_tiles) as pbar:
+                for future in as_completed(futures):
+                    if future.result():
+                        successful_downloads += 1
+                    pbar.update(1)
+
+        self.logger.info(
+            f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
+        )
+
+    def download_images_by_bounds(
+        self,
+        gdf: gpd.GeoDataFrame,
+        output_dir: Union[str, Path],
+        image_size: Tuple[int, int] = (512, 512),
+        max_workers: int = 4,
+        image_prefix: str = "image_",
+    ) -> None:
+        """
+        Download images for the given bounding boxes using the specified style
+
+        Args:
+            gdf: GeoDataFrame containing bounding box polygons
+            output_dir: Directory to save images
+            image_size: Tuple of (width, height) for output images
+            max_workers: Maximum number of concurrent downloads
+            image_prefix: Prefix for output image names
+        """
+        output_dir = Path(output_dir)
+        # self.data_store.makedirs(str(output_dir), exist_ok=True)
+
+        image_size_str = f"{image_size[0]}x{image_size[1]}"
+        total_images = len(gdf)
+
+        self.logger.info(
+            f"Downloading {total_images} images with size {image_size_str}..."
+        )
+
+        def download_image(idx: Any, bounds: Tuple[float, float, float, float]) -> bool:
+            file_name = f"{image_prefix}{idx}.png"
+            url = self._construct_url(bounds, image_size_str)
+            success = self._download_single_image(url, output_dir / file_name)
+            return success
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [
+                executor.submit(download_image, row.Index, row.geometry.bounds)
+                for row in gdf.itertuples()
+            ]
+
+            successful_downloads = 0
+            with tqdm(total=total_images) as pbar:
+                for future in as_completed(futures):
+                    if future.result():
+                        successful_downloads += 1
+                    pbar.update(1)
+
+        self.logger.info(
+            f"Successfully downloaded {successful_downloads}/{total_images} images!"
+        )
+
+    def download_images_by_coordinates(
+        self,
+        data: Union[pd.DataFrame, List[Tuple[float, float]]],
+        res_meters_pixel: float,
+        output_dir: Union[str, Path],
+        image_size: Tuple[int, int] = (512, 512),
+        max_workers: int = 4,
+        image_prefix: str = "image_",
+    ) -> None:
+        """
+        Download images for given coordinates by creating bounding boxes around points
+
+        Args:
+            data: A DataFrame with latitude/longitude columns (or a geometry column), or a list of (lat, lon) tuples
+            res_meters_pixel: Size of the bounding box in meters (creates a square)
+            output_dir: Directory to save images
+            image_size: Tuple of (width, height) for output images
+            max_workers: Maximum number of concurrent downloads
+            image_prefix: Prefix for output image names
+        """
+
+        if isinstance(data, pd.DataFrame):
+            coordinates_df = data
+        else:
+            coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])
+
+        gdf = convert_to_geodataframe(coordinates_df)
+
+        buffered_gdf = buffer_geodataframe(
+            gdf, res_meters_pixel / 2, cap_style="square"
+        )
+
+        self.download_images_by_bounds(
+            buffered_gdf, output_dir, image_size, max_workers, image_prefix
+        )
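Likewise, a minimal sketch of the coordinate-based entry point (an editorial addition, not part of the packaged code). The access token, coordinates, and output directory are placeholders; per the implementation above, each point is buffered into a square of res_meters_pixel meters and one static image is fetched per resulting bounding box.

    from gigaspatial.handlers.mapbox_image import MapboxImageDownloader

    downloader = MapboxImageDownloader(
        access_token="pk.placeholder-token",  # any valid Mapbox token
        style_id="mapbox/satellite-v9",       # the default style, made explicit
    )

    # One 512x512 image per point, each covering a 300 m square footprint.
    downloader.download_images_by_coordinates(
        data=[(41.0082, 28.9784), (39.9334, 32.8597)],  # (lat, lon) pairs
        res_meters_pixel=300,
        output_dir="mapbox_images",
        image_size=(512, 512),
        max_workers=4,
        image_prefix="tile_",
    )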