giga-spatial 0.6.9-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/METADATA +30 -4
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/RECORD +22 -20
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +1 -0
- gigaspatial/core/io/adls_data_store.py +104 -11
- gigaspatial/core/io/local_data_store.py +8 -0
- gigaspatial/generators/poi.py +226 -82
- gigaspatial/generators/zonal/base.py +41 -28
- gigaspatial/generators/zonal/geometry.py +91 -41
- gigaspatial/grid/h3.py +417 -0
- gigaspatial/grid/mercator_tiles.py +1 -1
- gigaspatial/handlers/base.py +22 -8
- gigaspatial/handlers/ghsl.py +22 -8
- gigaspatial/handlers/giga.py +9 -4
- gigaspatial/handlers/healthsites.py +350 -0
- gigaspatial/handlers/osm.py +325 -105
- gigaspatial/handlers/worldpop.py +228 -9
- gigaspatial/processing/geo.py +11 -6
- gigaspatial/processing/tif_processor.py +1183 -496
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/top_level.txt +0 -0
gigaspatial/handlers/ghsl.py
CHANGED
@@ -610,19 +610,27 @@ class GHSLDataReader(BaseHandlerReader):
         super().__init__(config=config, data_store=data_store, logger=logger)
 
     def load_from_paths(
-        self, source_data_path: List[Union[str, Path]], **kwargs
-    ) -> List[TifProcessor]:
+        self,
+        source_data_path: List[Union[str, Path]],
+        merge_rasters: bool = False,
+        **kwargs,
+    ) -> Union[List[TifProcessor], TifProcessor]:
         """
         Load TifProcessors from GHSL dataset.
         Args:
             source_data_path: List of file paths to load
+            merge_rasters: If True, all rasters will be merged into a single TifProcessor.
+                Defaults to False.
         Returns:
-            List[TifProcessor]: List of TifProcessor objects for accessing the raster data
+            Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single
+                TifProcessor if merge_rasters is True.
         """
-        return self._load_raster_data(raster_paths=source_data_path)
+        return self._load_raster_data(
+            raster_paths=source_data_path, merge_rasters=merge_rasters
+        )
 
-    def load(self, source, **kwargs):
-        return super().load(source=source, file_ext=".tif")
+    def load(self, source, merge_rasters: bool = False, **kwargs):
+        return super().load(source=source, file_ext=".tif", merge_rasters=merge_rasters)
 
 
 class GHSLDataHandler(BaseHandler):
@@ -763,6 +771,7 @@ class GHSLDataHandler(BaseHandler):
             List[Union[str, Path]],  # list of paths
         ],
         ensure_available: bool = True,
+        merge_rasters: bool = False,
         **kwargs,
     ):
         return super().load_data(
@@ -771,6 +780,7 @@ class GHSLDataHandler(BaseHandler):
             file_ext=".tif",
             extract=True,
             file_pattern=r".*\.tif$",
+            merge_rasters=merge_rasters,
             **kwargs,
         )
 
@@ -801,8 +811,10 @@ class GHSLDataHandler(BaseHandler):
         tif_processors = self.load_data(
             source=source, ensure_available=ensure_available, **kwargs
         )
+        if isinstance(tif_processors, TifProcessor):
+            return tif_processors.to_dataframe(**kwargs)
         return pd.concat(
-            [tp.to_dataframe() for tp in tif_processors], ignore_index=True
+            [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
        )
 
     def load_into_geodataframe(
@@ -832,8 +844,10 @@ class GHSLDataHandler(BaseHandler):
         tif_processors = self.load_data(
             source=source, ensure_available=ensure_available, **kwargs
         )
+        if isinstance(tif_processors, TifProcessor):
+            return tif_processors.to_geodataframe(**kwargs)
         return pd.concat(
-            [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
+            [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
         )
 
     def get_available_data_info(
gigaspatial/handlers/giga.py
CHANGED
@@ -8,6 +8,7 @@ from shapely.geometry import Point
 import pycountry
 from typing import Optional, Union
 import logging
+import geopandas as gpd
 
 from gigaspatial.config import config as global_config
 
@@ -40,11 +41,14 @@ class GigaSchoolLocationFetcher:
         if self.logger is None:
             self.logger = global_config.get_logger(self.__class__.__name__)
 
-    def fetch_locations(self, **kwargs) -> pd.DataFrame:
+    def fetch_locations(
+        self, process_geospatial: bool = False, **kwargs
+    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
         """
         Fetch and process school locations.
 
         Args:
+            process_geospatial (bool): Whether to process geospatial data and return a GeoDataFrame. Defaults to False.
             **kwargs: Additional parameters for customization
                 - page_size: Override default page size
                 - sleep_time: Override default sleep time between requests
@@ -122,11 +126,12 @@ class GigaSchoolLocationFetcher:
 
         df = pd.DataFrame(all_data)
 
-        df = self._process_geospatial_data(df)
+        if process_geospatial:
+            df = self._process_geospatial_data(df)
 
         return df
 
-    def _process_geospatial_data(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _process_geospatial_data(self, df: pd.DataFrame) -> gpd.GeoDataFrame:
         """
         Process and enhance the DataFrame with geospatial information.
 
@@ -144,7 +149,7 @@ class GigaSchoolLocationFetcher:
         )
         self.logger.info(f"Created geometry for all {len(df)} records")
 
-        return df
+        return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
 
 
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
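The giga.py change makes geospatial post-processing opt-in: fetch_locations() keeps returning a plain pd.DataFrame, while fetch_locations(process_geospatial=True) builds point geometries and returns a gpd.GeoDataFrame in EPSG:4326. A short sketch; the constructor arguments are assumed for illustration, since the hunks above only show fetch_locations:

from gigaspatial.handlers.giga import GigaSchoolLocationFetcher

# Sketch only: constructor arguments assumed for illustration.
fetcher = GigaSchoolLocationFetcher(country="KEN")

df = fetcher.fetch_locations()                          # pd.DataFrame, as in 0.6.9
gdf = fetcher.fetch_locations(process_geospatial=True)  # gpd.GeoDataFrame, EPSG:4326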
gigaspatial/handlers/healthsites.py
ADDED
@@ -0,0 +1,350 @@
+import requests
+import pandas as pd
+import geopandas as gpd
+import time
+from typing import List, Optional, Union, Tuple
+from pydantic.dataclasses import dataclass, Field
+from pydantic import ConfigDict
+import pycountry
+
+from gigaspatial.config import config
+from gigaspatial.handlers import OSMLocationFetcher
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class HealthSitesFetcher:
+    """
+    Fetch and process health facility location data from the Healthsites.io API.
+    """
+
+    country: Optional[str] = Field(default=None, description="Country to filter")
+    api_url: str = Field(
+        default="https://healthsites.io/api/v3/facilities/",
+        description="Base URL for the Healthsites API",
+    )
+    api_key: str = config.HEALTHSITES_API_KEY
+    extent: Optional[Tuple[float, float, float, float]] = Field(
+        default=None, description="Bounding box as (minLng, minLat, maxLng, maxLat)"
+    )
+    page_size: int = Field(default=100, description="Number of records per API page")
+    flat_properties: bool = Field(
+        default=True, description="Show properties in flat format"
+    )
+    tag_format: str = Field(default="osm", description="Tag format (osm/hxl)")
+    output_format: str = Field(
+        default="geojson", description="Output format (json/geojson)"
+    )
+    sleep_time: float = Field(
+        default=0.2, description="Sleep time between API requests"
+    )
+
+    def __post_init__(self):
+        self.logger = config.get_logger(self.__class__.__name__)
+        # Convert country code to OSM English name if provided
+        if self.country:
+            self.country = self._convert_country(self.country)
+
+    def fetch_facilities(self, **kwargs) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+        """
+        Fetch and process health facility locations.
+
+        Args:
+            **kwargs: Additional parameters for customization
+                - country: Override country filter
+                - extent: Override extent filter
+                - from_date: Get data modified from this timestamp (datetime or string)
+                - to_date: Get data modified to this timestamp (datetime or string)
+                - page_size: Override default page size
+                - sleep_time: Override default sleep time between requests
+                - max_pages: Limit the number of pages to fetch
+                - output_format: Override output format ('json' or 'geojson')
+                - flat_properties: Override flat properties setting
+
+        Returns:
+            Union[pd.DataFrame, gpd.GeoDataFrame]: Health facilities data.
+                Returns GeoDataFrame for geojson format, DataFrame for json format.
+        """
+        # Override defaults with kwargs if provided
+        country = kwargs.get("country", self.country)
+        extent = kwargs.get("extent", self.extent)
+        from_date = kwargs.get("from_date", None)
+        to_date = kwargs.get("to_date", None)
+        page_size = kwargs.get("page_size", self.page_size)
+        sleep_time = kwargs.get("sleep_time", self.sleep_time)
+        max_pages = kwargs.get("max_pages", None)
+        output_format = kwargs.get("output_format", self.output_format)
+        flat_properties = kwargs.get("flat_properties", self.flat_properties)
+
+        # Convert country if provided in kwargs
+        if country:
+            country = self._convert_country(country)
+
+        # Prepare base parameters
+        base_params = {
+            "api-key": self.api_key,
+            "tag-format": self.tag_format,
+            "output": output_format,
+        }
+
+        # Only add flat-properties if True (don't send it as false, as that makes it flat anyway)
+        if flat_properties:
+            base_params["flat-properties"] = "true"
+
+        # Add optional filters
+        if country:
+            base_params["country"] = country
+
+        if extent:
+            if len(extent) != 4:
+                raise ValueError(
+                    "Extent must be a tuple of 4 values: (minLng, minLat, maxLng, maxLat)"
+                )
+            base_params["extent"] = ",".join(map(str, extent))
+
+        if from_date:
+            base_params["from"] = self._format_timestamp(from_date)
+
+        if to_date:
+            base_params["to"] = self._format_timestamp(to_date)
+
+        all_data = []
+        page = 1
+
+        self.logger.info(
+            f"Starting to fetch health facilities for country: {country or 'all countries'}"
+        )
+        self.logger.info(
+            f"Output format: {output_format}, Flat properties: {flat_properties}"
+        )
+
+        while True:
+            # Check if we've reached max_pages limit
+            if max_pages and page > max_pages:
+                self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                break
+
+            # Add page parameter
+            params = base_params.copy()
+            params["page"] = page
+
+            try:
+                self.logger.debug(f"Fetching page {page} with params: {params}")
+                response = requests.get(self.api_url, params=params)
+                response.raise_for_status()
+
+                parsed = response.json()
+
+                # Handle different response structures based on output format
+                if output_format == "geojson":
+                    # GeoJSON returns FeatureCollection with features list
+                    data = parsed.get("features", [])
+                else:
+                    # JSON returns direct list
+                    data = parsed if isinstance(parsed, list) else []
+
+            except requests.exceptions.RequestException as e:
+                self.logger.error(f"Request failed on page {page}: {e}")
+                break
+            except ValueError as e:
+                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                break
+
+            # Check if we got any data
+            if not data or not isinstance(data, list):
+                self.logger.info(f"No data on page {page}. Stopping.")
+                break
+
+            all_data.extend(data)
+            self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+            # If we got fewer records than page_size, we've reached the end
+            if len(data) < page_size:
+                self.logger.info("Reached end of data (partial page received)")
+                break
+
+            page += 1
+
+            # Sleep to be respectful to the API
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+        # Convert to DataFrame/GeoDataFrame based on format
+        if not all_data:
+            self.logger.warning("No data fetched, returning empty DataFrame")
+            if output_format == "geojson":
+                return gpd.GeoDataFrame()
+            return pd.DataFrame()
+
+        if output_format == "geojson":
+            # Use GeoDataFrame.from_features for GeoJSON format
+            gdf = gpd.GeoDataFrame.from_features(all_data)
+            self.logger.info(f"Created GeoDataFrame with {len(gdf)} records")
+            return gdf
+        else:
+            # For JSON format, handle nested structure if flat_properties is False
+            if not flat_properties:
+                df = self._process_json_with_centroid(all_data)
+            else:
+                df = pd.DataFrame(all_data)
+
+            self.logger.info(f"Created DataFrame with {len(df)} records")
+            return df
+
+    def fetch_statistics(self, **kwargs) -> dict:
+        """
+        Fetch statistics for health facilities.
+
+        Args:
+            **kwargs: Same filtering parameters as fetch_facilities
+
+        Returns:
+            dict: Statistics data
+        """
+        country = kwargs.get("country", self.country)
+        extent = kwargs.get("extent", self.extent)
+        from_date = kwargs.get("from_date", None)
+        to_date = kwargs.get("to_date", None)
+
+        # Convert country if provided
+        if country:
+            country = self._convert_country(country)
+
+        params = {
+            "api-key": self.api_key,
+        }
+
+        # Add optional filters
+        if country:
+            params["country"] = country
+        if extent:
+            params["extent"] = ",".join(map(str, extent))
+        if from_date:
+            params["from"] = self._format_timestamp(from_date)
+        if to_date:
+            params["to"] = self._format_timestamp(to_date)
+
+        try:
+            response = requests.get(f"{self.api_url}/statistic/", params=params)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Request failed for statistics: {e}")
+            raise
+
+    def fetch_facility_by_id(self, osm_type: str, osm_id: str) -> dict:
+        """
+        Fetch a specific facility by OSM type and ID.
+
+        Args:
+            osm_type: OSM type (node, way, relation)
+            osm_id: OSM ID
+
+        Returns:
+            dict: Facility details
+        """
+        params = {"api-key": self.api_key}
+
+        try:
+            url = f"{self.api_url}/{osm_type}/{osm_id}"
+            response = requests.get(url, params=params)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Request failed for facility {osm_type}/{osm_id}: {e}")
+            raise
+
+    def _create_dataframe(self, data: List[dict]) -> pd.DataFrame:
+        """
+        Create DataFrame from API response data.
+
+        Args:
+            data: List of facility records
+
+        Returns:
+            pd.DataFrame: Processed DataFrame
+        """
+        if self.output_format == "geojson":
+            # Handle GeoJSON format
+            records = []
+            for feature in data:
+                record = feature.get("properties", {}).copy()
+                geometry = feature.get("geometry", {})
+                coordinates = geometry.get("coordinates", [])
+
+                if coordinates and len(coordinates) >= 2:
+                    record["longitude"] = coordinates[0]
+                    record["latitude"] = coordinates[1]
+
+                records.append(record)
+            return pd.DataFrame(records)
+        else:
+            # Handle regular JSON format
+            return pd.DataFrame(data)
+
+    def _process_json_with_centroid(self, data: List[dict]) -> pd.DataFrame:
+        """
+        Process JSON data to flatten 'attributes' and 'centroid' fields,
+        and extract longitude/latitude from centroid.
+
+        Args:
+            data: List of facility records, where each record might contain
+                nested 'attributes' and 'centroid' dictionaries.
+
+        Returns:
+            pd.DataFrame: Processed DataFrame with flattened data.
+        """
+        processed_records = []
+        for record in data:
+            new_record = {}
+
+            # Flatten top-level keys
+            for key, value in record.items():
+                if key not in ["attributes", "centroid"]:
+                    new_record[key] = value
+
+            # Flatten 'attributes'
+            attributes = record.get("attributes", {})
+            for attr_key, attr_value in attributes.items():
+                new_record[f"{attr_key}"] = attr_value
+
+            # Extract centroid coordinates
+            centroid = record.get("centroid", {})
+            coordinates = centroid.get("coordinates", [])
+            if coordinates and len(coordinates) == 2:
+                new_record["longitude"] = coordinates[0]
+                new_record["latitude"] = coordinates[1]
+            else:
+                new_record["longitude"] = None
+                new_record["latitude"] = None
+
+            processed_records.append(new_record)
+
+        return pd.DataFrame(processed_records)
+
+    def _convert_country(self, country: str) -> str:
+        try:
+            # First convert to ISO3 format if needed
+            country_obj = pycountry.countries.lookup(country)
+            iso3_code = country_obj.alpha_3
+
+            # Get OSM English name using OSMLocationFetcher
+            osm_data = OSMLocationFetcher.get_osm_countries(iso3_code=iso3_code)
+            osm_name_en = osm_data.get("name:en")
+
+            if not osm_name_en:
+                raise ValueError(
+                    f"Could not find OSM English name for country: {country}"
+                )
+
+            self.logger.info(
+                f"Converted country code to OSM English name: {osm_name_en}"
+            )
+
+            return osm_name_en
+
+        except LookupError:
+            raise ValueError(f"Invalid country code provided: {country}")
+        except Exception as e:
+            raise ValueError(f"Failed to get OSM English name: {e}")
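A minimal end-to-end sketch of the new fetcher, restricted to the methods defined above. It assumes HEALTHSITES_API_KEY is set in the gigaspatial config; the country value and OSM id are illustrative:

from gigaspatial.handlers.healthsites import HealthSitesFetcher

fetcher = HealthSitesFetcher(country="KEN")  # resolved to the OSM English name via pycountry

# Default output_format="geojson": paginated fetch returned as a GeoDataFrame
gdf = fetcher.fetch_facilities(max_pages=2)

# JSON output with flat_properties=False: nested 'attributes' are flattened and
# longitude/latitude are extracted from each record's centroid
df = fetcher.fetch_facilities(output_format="json", flat_properties=False)

stats = fetcher.fetch_statistics()                       # aggregate stats as a dict
one = fetcher.fetch_facility_by_id("node", "123456789")  # illustrative OSM id

Worth noting: the from_date/to_date filters call a _format_timestamp helper that is not among the 350 added lines shown here.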